diff --git a/.bazelrc b/.bazelrc index 44cb9ec2dcb40..c363481091e6f 100644 --- a/.bazelrc +++ b/.bazelrc @@ -9,6 +9,10 @@ build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_ti build:clang-tidy --output_groups=report build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config +# This warning seems to incorrectly fire in this build configuration, despite +# not firing in our normal builds. +build:clang-tidy --copt=-Wno-unknown-pragmas + # Default to using a disk cache to minimize re-building LLVM and Clang which we # try to avoid updating too frequently to minimize rebuild cost. The location # here can be overridden in the user configuration where needed. diff --git a/.codespell_ignore b/.codespell_ignore index 309736ef36f24..a47281918894d 100644 --- a/.codespell_ignore +++ b/.codespell_ignore @@ -11,6 +11,7 @@ createor crossreference falsy forin +groupt inout parameteras pullrequest diff --git a/common/BUILD b/common/BUILD index debe86d58a845..9ba9096918280 100644 --- a/common/BUILD +++ b/common/BUILD @@ -182,6 +182,14 @@ cc_binary( ], ) +cc_library( + name = "hashtable_key_context", + hdrs = ["hashtable_key_context.h"], + deps = [ + ":hashing", + ], +) + cc_library( name = "indirect_value", hdrs = ["indirect_value.h"], @@ -224,6 +232,53 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "map", + hdrs = ["map.h"], + deps = [ + ":check", + ":hashtable_key_context", + ":raw_hashtable", + "@llvm-project//llvm:Support", + ], +) + +cc_test( + name = "map_test", + srcs = ["map_test.cpp"], + deps = [ + ":map", + ":raw_hashtable_test_helpers", + "//testing/base:gtest_main", + "//testing/base:test_raw_ostream", + "@googletest//:gtest", + ], +) + +cc_binary( + name = "map_benchmark", + testonly = 1, + srcs = ["map_benchmark.cpp"], + deps = [ + ":map", + ":raw_hashtable_benchmark_helpers", + "@abseil-cpp//absl/container:flat_hash_map", + "@abseil-cpp//absl/random", + "@google_benchmark//:benchmark_main", + "@llvm-project//llvm:Support", + ], +) + +sh_test( + name = "map_benchmark_test", + # The benchmark allocates a large amount of memory. + size = "enormous", + # We configure the test to run quickly. + timeout = "short", + srcs = ["map_benchmark_test.sh"], + data = [":map_benchmark"], +) + cc_library( name = "ostream", hdrs = ["ostream.h"], @@ -232,6 +287,126 @@ cc_library( ], ) +cc_library( + name = "raw_hashtable", + srcs = ["raw_hashtable.cpp"], + hdrs = ["raw_hashtable.h"], + deps = [ + ":check", + ":hashing", + ":hashtable_key_context", + ":raw_hashtable_metadata_group", + "@llvm-project//llvm:Support", + ], +) + +cc_library( + name = "raw_hashtable_metadata_group", + srcs = ["raw_hashtable_metadata_group.cpp"], + hdrs = ["raw_hashtable_metadata_group.h"], + deps = [ + ":check", + "@llvm-project//llvm:Support", + ], +) + +cc_binary( + name = "raw_hashtable_metadata_group_benchmark", + testonly = 1, + srcs = ["raw_hashtable_metadata_group_benchmark.cpp"], + deps = [ + ":raw_hashtable_metadata_group", + "@abseil-cpp//absl/random", + "@google_benchmark//:benchmark_main", + "@llvm-project//llvm:Support", + ], +) + +sh_test( + name = "raw_hashtable_metadata_group_benchmark_test", + srcs = ["raw_hashtable_metadata_group_benchmark_test.sh"], + data = [":raw_hashtable_metadata_group_benchmark"], +) + +cc_library( + name = "raw_hashtable_benchmark_helpers", + testonly = 1, + srcs = ["raw_hashtable_benchmark_helpers.cpp"], + hdrs = ["raw_hashtable_benchmark_helpers.h"], + copts = [ + "-O2", # Always optimize to make testing benchmarks faster. 
+    ],
+    deps = [
+        ":check",
+        ":hashing",
+        ":raw_hashtable",
+        ":set",
+        "@abseil-cpp//absl/base:no_destructor",
+        "@abseil-cpp//absl/hash",
+        "@abseil-cpp//absl/random",
+        "@google_benchmark//:benchmark",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_library(
+    name = "raw_hashtable_test_helpers",
+    testonly = 1,
+    hdrs = ["raw_hashtable_test_helpers.h"],
+    deps = [
+        ":check",
+        ":hashing",
+        ":hashtable_key_context",
+        ":ostream",
+    ],
+)
+
+cc_library(
+    name = "set",
+    hdrs = ["set.h"],
+    deps = [
+        ":check",
+        ":hashtable_key_context",
+        ":raw_hashtable",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_test(
+    name = "set_test",
+    srcs = ["set_test.cpp"],
+    deps = [
+        ":raw_hashtable_test_helpers",
+        ":set",
+        "//testing/base:gtest_main",
+        "//testing/base:test_raw_ostream",
+        "@googletest//:gtest",
+    ],
+)
+
+cc_binary(
+    name = "set_benchmark",
+    testonly = 1,
+    srcs = ["set_benchmark.cpp"],
+    deps = [
+        ":raw_hashtable_benchmark_helpers",
+        ":set",
+        "@abseil-cpp//absl/container:flat_hash_set",
+        "@google_benchmark//:benchmark_main",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+sh_test(
+    name = "set_benchmark_test",
+    # The benchmark allocates a large amount of memory.
+    size = "enormous",
+    # We configure the test to run quickly.
+    timeout = "short",
+    srcs = ["set_benchmark_test.sh"],
+    data = [":set_benchmark"],
+)
+
 cc_library(
     name = "string_helpers",
     srcs = ["string_helpers.cpp"],
diff --git a/common/hashing.h b/common/hashing.h
index 2e8edfdc2671b..e4a0f27de6f63 100644
--- a/common/hashing.h
+++ b/common/hashing.h
@@ -573,9 +573,9 @@ constexpr auto HashCode::ExtractIndex() -> ssize_t { return value_; }
 template <int N>
 constexpr auto HashCode::ExtractIndexAndTag() -> std::pair<ssize_t, uint32_t> {
   static_assert(N >= 1);
-  static_assert(N <= 32);
+  static_assert(N < 32);
   return {static_cast<ssize_t>(value_ >> N),
-          static_cast<uint32_t>(value_ & ((1U << (N + 1)) - 1))};
+          static_cast<uint32_t>(value_ & ((1U << N) - 1))};
 }
 
 // Building with `-DCARBON_MCA_MARKERS` will enable `llvm-mca` annotations in
diff --git a/common/hashing_test.cpp b/common/hashing_test.cpp
index 23553901c11d1..01ad3a792b365 100644
--- a/common/hashing_test.cpp
+++ b/common/hashing_test.cpp
@@ -40,6 +40,12 @@ TEST(HashingTest, HashCodeAPI) {
   EXPECT_THAT(a.ExtractIndex(), Ne(b.ExtractIndex()));
   EXPECT_THAT(a.ExtractIndex(), Ne(empty.ExtractIndex()));
 
+  // The tag shouldn't have bits set outside the range requested.
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<1>().second & ~0b1, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>().second & ~0b11, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<3>().second & ~0b111, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<4>().second & ~0b1111, Eq(0));
+
   // Note that the index produced with a tag may be different from the index
   // alone!
   EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>(),
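A quick worked check of the `ExtractIndexAndTag<N>` change above (an editorial aside, not part of the patch): the tag must be masked to exactly the low `N` bits, so the mask is `(1U << N) - 1`; the previous `(1U << (N + 1)) - 1` kept one extra bit, and the tightened `static_assert(N < 32)` keeps the `1U << N` shift in range for the 32-bit tag.

```cpp
// For N = 2: the corrected mask keeps exactly two bits; the old one kept three.
static_assert(((1U << 2) - 1) == 0b11, "corrected mask for N = 2");
static_assert(((1U << (2 + 1)) - 1) == 0b111, "previous mask kept N + 1 bits");
// Applying the corrected mask leaves no bits outside the requested range,
// matching the new expectations added to hashing_test.cpp.
static_assert((0b10110101U & ((1U << 2) - 1)) == 0b01, "only tag bits remain");
```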
diff --git a/common/hashtable_key_context.h b/common/hashtable_key_context.h
new file mode 100644
index 0000000000000..4fb3a28dc3142
--- /dev/null
+++ b/common/hashtable_key_context.h
@@ -0,0 +1,85 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
+#define CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
+
+#include "common/hashing.h"
+
+namespace Carbon {
+
+// Customizable context for keys in hashtables.
+//
+// This type or customizations matching its API are used with the data
+// structures in `map.h` and `set.h`. By providing a custom version of the
+// `KeyContext` type parameter to those data structures, users can provide
+// either stateless or stateful customization of the two core hashtable key
+// operations: hashing and comparison.
+//
+// The default for hashing uses Carbon's `hashing.h`. Customizations must still
+// return a `HashCode` as defined there, and it needs to have the same core
+// properties as hashes produced by the `hashing.h` infrastructure.
+//
+// The default for comparison is `operator==`. The `KeyEq` method is always
+// called with a key *stored in the hashtable* as the second or "RHS"
+// parameter. This is to allow simplifying the set of overloads needed for
+// heterogeneous contexts: only the first, LHS, parameter needs to support
+// different lookup key types.
+//
+// Custom KeyContext types should have the same API as the default type. They
+// can choose to use templates to support heterogeneous key types or not as
+// appropriate. The default context can also be used as a base class with only
+// one or the other APIs customized.
+//
+// An important consideration is how the key context is constructed. When the
+// key context can be default constructed, hashtable APIs trafficking in keys
+// will have overloads that provide a default constructed key context. When the
+// context is *not* default constructible, every API that accepts a key will
+// also require a context argument to be called, and that argument will be used
+// throughout that operation. The intent is to allow callers to provide
+// stateful contexts to each API where they are needed, while managing that
+// state outside the hashtable. Often the needed state is trivially part of the
+// caller's existing state and needn't be stored separately.
+//
+// Example of a stateful, customized key context for interned strings:
+// ```cpp
+// class InternedStringIndexKeyContext {
+//  public:
+//   InternedStringIndexKeyContext(
+//       llvm::ArrayRef<llvm::StringRef> interned_strings)
+//       : interned_strings_(interned_strings) {}
+//
+//   auto HashKey(llvm::StringRef s, uint64_t seed) const -> HashCode {
+//     return HashValue(s, seed);
+//   }
+//   auto HashKey(int index_key, uint64_t seed) const -> HashCode {
+//     return HashKey(interned_strings_[index_key], seed);
+//   }
+//
+//   auto KeyEq(llvm::StringRef lhs, int rhs_index) const -> bool {
+//     return lhs == interned_strings_[rhs_index];
+//   }
+//   auto KeyEq(int lhs_index, int rhs_index) const -> bool {
+//     return KeyEq(interned_strings_[lhs_index], rhs_index);
+//   }
+//
+//  private:
+//   llvm::ArrayRef<llvm::StringRef> interned_strings_;
+// };
+// ```
+struct DefaultKeyContext {
+  template <typename KeyT>
+  auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode {
+    return HashValue(key, seed);
+  }
+
+  template <typename LHSKeyT, typename RHSKeyT>
+  auto KeyEq(const LHSKeyT& lhs_key, const RHSKeyT& rhs_key) const -> bool {
+    return lhs_key == rhs_key;
+  }
+};
+
+}  // namespace Carbon
+
+#endif  // CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
diff --git a/common/map.h b/common/map.h
new file mode 100644
index 0000000000000..5922cf9a342ba
--- /dev/null
+++ b/common/map.h
@@ -0,0 +1,558 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef CARBON_COMMON_MAP_H_ +#define CARBON_COMMON_MAP_H_ + +#include +#include +#include + +#include "common/check.h" +#include "common/hashtable_key_context.h" +#include "common/raw_hashtable.h" +#include "llvm/Support/Compiler.h" + +namespace Carbon { + +// Forward declarations to resolve cyclic references. +template +class MapView; +template +class MapBase; +template +class Map; + +// A read-only view type for a map from key to value. +// +// This view is a cheap-to-copy type that should be passed by value, but +// provides view or read-only reference semantics to the underlying map data +// structure. +// +// This should always be preferred to a `const`-ref parameter for the `MapBase` +// or `Map` type as it provides more flexibility and a cleaner API. +// +// Note that while this type is a read-only view, that applies to the underlying +// *map* data structure, not the individual entries stored within it. Those can +// be mutated freely as long as both the hashes and equality of the keys are +// preserved. If we applied a deep-`const` design here, it would prevent using +// this type in many useful situations where the elements are mutated but the +// associative container is not. A view of immutable data can always be obtained +// by using `MapView`, and we enable conversions to more-const +// views. This mirrors the semantics of views like `std::span`. +// +// A specific `KeyContextT` type can optionally be provided to configure how +// keys will be hashed and compared. The default is `DefaultKeyContext` which is +// stateless and will hash using `Carbon::HashValue` and compare using +// `operator==`. Every method accepting a lookup key or operating on the keys in +// the table will also accept an instance of this type. For stateless context +// types, including the default, an instance will be default constructed if not +// provided to these methods. However, stateful contexts should be constructed +// and passed in explicitly. The context type should be small and reasonable to +// pass by value, often a wrapper or pointer to the relevant context needed for +// hashing and comparing keys. For more details about the key context, see +// `hashtable_key_context.h`. +template +class MapView + : RawHashtable::ViewImpl { + using ImplT = + RawHashtable::ViewImpl; + using EntryT = typename ImplT::EntryT; + + public: + using KeyT = typename ImplT::KeyT; + using ValueT = typename ImplT::ValueT; + using KeyContextT = typename ImplT::KeyContextT; + + // This type represents the result of lookup operations. It encodes whether + // the lookup was a success as well as accessors for the key and value. + class LookupKVResult { + public: + LookupKVResult() = default; + explicit LookupKVResult(EntryT* entry) : entry_(entry) {} + + explicit operator bool() const { return entry_ != nullptr; } + + auto key() const -> KeyT& { return entry_->key(); } + auto value() const -> ValueT& { return entry_->value(); } + + private: + EntryT* entry_ = nullptr; + }; + + // Enable implicit conversions that add `const`-ness to either key or value + // type. This is always safe to do with a view. We use a template to avoid + // needing all 3 versions. + template + // NOLINTNEXTLINE(google-explicit-constructor) + MapView(MapView other_view) + requires(std::same_as || + std::same_as) && + (std::same_as || + std::same_as) + : ImplT(other_view) {} + + // Tests whether a key is present in the map. 
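To make the view semantics described above concrete, here is a hedged usage sketch (an editorial illustration, not part of the patch); it assumes only the `Map`, `MapView`, and `Lookup` APIs declared in this header:

```cpp
#include "common/map.h"
#include "llvm/ADT/StringRef.h"

namespace {

// A read-only consumer takes the view by value; the const-adding implicit
// conversions described above let a mutable `Map` bind to a view of const
// elements.
auto LookupOrZero(Carbon::MapView<const llvm::StringRef, const int> counts,
                  llvm::StringRef key) -> int {
  auto result = counts.Lookup(key);
  return result ? result.value() : 0;
}

auto Example() -> int {
  Carbon::Map<llvm::StringRef, int> counts;
  counts.Insert(llvm::StringRef("carbon"), 1);
  counts.Insert(llvm::StringRef("llvm"), 2);
  return LookupOrZero(counts, "carbon") + LookupOrZero(counts, "missing");
}

}  // namespace
```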
+ template + auto Contains(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> bool; + + // Lookup a key in the map. + template + auto Lookup(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> LookupKVResult; + + // Lookup a key in the map and try to return a pointer to its value. Returns + // null on a missing key. + template + auto operator[](LookupKeyT lookup_key) const + -> ValueT* requires(std::default_initializable); + + // Run the provided callback for every key and value in the map. + template + void ForEach(CallbackT callback) + requires(std::invocable); + + // This routine is relatively inefficient and only intended for use in + // benchmarking or logging of performance anomalies. The specific count + // returned has no specific guarantees beyond being informative in benchmarks. + // It counts how many of the keys in the hashtable have required probing + // beyond their initial group of slots. + // + // TODO: Replace with a more general metrics routine that covers other + // important aspects such as load factor, and average probe *distance*. + auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t { + return ImplT::CountProbedKeys(key_context); + } + + private: + template + friend class Map; + friend class MapBase; + friend class MapView; + friend class MapView; + friend class MapView; + + MapView() = default; + // NOLINTNEXTLINE(google-explicit-constructor): Implicit by design. + MapView(ImplT base) : ImplT(base) {} + MapView(ssize_t size, RawHashtable::Storage* storage) + : ImplT(size, storage) {} +}; + +// A base class for a `Map` type that remains mutable while type-erasing the +// `SmallSize` (SSO) template parameter. +// +// A pointer or reference to this type is the preferred way to pass a mutable +// handle to a `Map` type across API boundaries as it avoids encoding specific +// SSO sizing information while providing a near-complete mutable API. +template +class MapBase : protected RawHashtable::BaseImpl { + protected: + using ImplT = + RawHashtable::BaseImpl; + using EntryT = typename ImplT::EntryT; + + public: + using KeyT = typename ImplT::KeyT; + using ValueT = typename ImplT::ValueT; + using KeyContextT = typename ImplT::KeyContextT; + using ViewT = MapView; + using LookupKVResult = typename ViewT::LookupKVResult; + + // The result type for insertion operations both indicates whether an insert + // was needed (as opposed to finding an existing element), and provides access + // to the element's key and value. + class InsertKVResult { + public: + InsertKVResult() = default; + explicit InsertKVResult(bool inserted, EntryT& entry) + : entry_(&entry), inserted_(inserted) {} + + auto is_inserted() const -> bool { return inserted_; } + + auto key() const -> KeyT& { return entry_->key(); } + auto value() const -> ValueT& { return entry_->value(); } + + private: + EntryT* entry_; + bool inserted_; + }; + + // Implicitly convertible to the relevant view type. + // + // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay. + operator ViewT() const { return this->view_impl(); } + + // We can't chain the above conversion with the conversions on `ViewT` to add + // const, so explicitly support adding const to produce a view here. + template + // NOLINTNEXTLINE(google-explicit-constructor) + operator MapView() const + requires(std::same_as || + std::same_as) && + (std::same_as || + std::same_as) + { + return ViewT(*this); + } + + // Convenience forwarder to the view type. 
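As a sketch of the API-boundary guidance above (illustrative only, not part of the patch), a mutating helper can accept `MapBase` by reference and stay agnostic to the caller's small-size buffer:

```cpp
#include "common/map.h"

namespace {

// The helper sees only `MapBase<int, int>`, so callers may pass a `Map` with
// any `SmallSize` without that size leaking into this signature.
auto RecordSample(Carbon::MapBase<int, int>& histogram, int bucket) -> void {
  auto result = histogram.Insert(bucket, 1);
  if (!result.is_inserted()) {
    // The insert result exposes the existing entry, so the count can be
    // bumped in place on the hit path.
    ++result.value();
  }
}

auto Example() -> void {
  // Small-size-optimized storage; 32 is a multiple of the 16-slot group size.
  Carbon::Map<int, int, 32> histogram;
  RecordSample(histogram, 3);
  RecordSample(histogram, 3);
}

}  // namespace
```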
+ template + auto Contains(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> bool { + return ViewT(*this).Contains(lookup_key, key_context); + } + + // Convenience forwarder to the view type. + template + auto Lookup(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> LookupKVResult { + return ViewT(*this).Lookup(lookup_key, key_context); + } + + // Convenience forwarder to the view type. + template + auto operator[](LookupKeyT lookup_key) const + -> ValueT* requires(std::default_initializable) { + return ViewT(*this)[lookup_key]; + } + + // Convenience forwarder to the view type. + template + void ForEach(CallbackT callback) + requires(std::invocable) + { + return ViewT(*this).ForEach(callback); + } + + // Convenience forwarder to the view type. + auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const + -> ssize_t { + return ViewT(*this).CountProbedKeys(key_context); + } + + // Insert a key and value into the map. If the key is already present, the new + // value is discarded and the existing value preserved. + template + auto Insert(LookupKeyT lookup_key, ValueT new_v, + KeyContextT key_context = KeyContextT()) -> InsertKVResult; + + // Insert a key into the map and call the provided callback if necessary to + // produce a new value when no existing value is found. + // + // Example: `m.Insert(key, [] { return default_value; });` + // + // TODO: The `;` formatting below appears to be bugs in clang-format with + // concepts that should be filed upstream. + template + auto Insert(LookupKeyT lookup_key, ValueCallbackT value_cb, + KeyContextT key_context = KeyContextT()) -> InsertKVResult + requires( + !std::same_as && + std::convertible_to()()), ValueT>) + ; + + // Lookup a key in the map and if missing insert it and call the provided + // callback to in-place construct both the key and value. The lookup key is + // passed through to the callback so it needn't be captured and can be kept in + // a register argument throughout. + // + // Example: + // ```cpp + // m.Insert("widget", [](MyStringViewType lookup_key, void* key_storage, + // void* value_storage) { + // new (key_storage) MyStringType(lookup_key); + // new (value_storage) MyValueType(....); + // }); + // ``` + template + auto Insert(LookupKeyT lookup_key, InsertCallbackT insert_cb, + KeyContextT key_context = KeyContextT()) -> InsertKVResult + requires(!std::same_as && + std::invocable); + + // Replace a key's value in a map if already present or insert it if not + // already present. The new value is always used. + template + auto Update(LookupKeyT lookup_key, ValueT new_v, + KeyContextT key_context = KeyContextT()) -> InsertKVResult; + + // Lookup or insert a key into the map, and set it's value to the result of + // the `value_cb` callback. The callback is always run and its result is + // always used, whether the key was already in the map or not. Any existing + // value is replaced with the result. + // + // Example: `m.Update(key, [] { return new_value; });` + template + auto Update(LookupKeyT lookup_key, ValueCallbackT value_cb, + KeyContextT key_context = KeyContextT()) -> InsertKVResult + requires( + !std::same_as && + std::convertible_to()()), ValueT>) + ; + + // Lookup or insert a key into the map. If not already present and the key is + // inserted, the `insert_cb` is used to construct the new key and value in + // place. 
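An illustrative sketch of the callback-taking `Insert` overload documented above (not part of the patch; the comment describing the two-callback `Update` overload continues below). The value factory only runs when the key is actually missing:

```cpp
#include <string>

#include "common/map.h"
#include "llvm/ADT/StringRef.h"

namespace {

// The factory is invoked only on a miss, so the `std::string` copy is never
// made on the hit path.
auto InternCopy(Carbon::MapBase<llvm::StringRef, std::string>& m,
                llvm::StringRef key) -> bool {
  auto result = m.Insert(key, [&] { return key.str(); });
  return result.is_inserted();
}

}  // namespace
```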
When inserting, the lookup key is passed through to the callback so + // it needn't be captured and can be kept in a register argument throughout. + // If the key was already present, the `update_cb` is called to update the + // existing key and value as desired. + // + // Example of counting occurrences: + // ```cpp + // m.Update(item, /*insert_cb=*/[](MyStringViewType lookup_key, + // void* key_storage, void* value_storage) { + // new (key_storage) MyItem(lookup_key); + // new (value_storage) Count(1); + // }, + // /*update_cb=*/[](MyItem& /*key*/, Count& count) { + // ++count; + // }); + // ``` + template + auto Update(LookupKeyT lookup_key, InsertCallbackT insert_cb, + UpdateCallbackT update_cb, + KeyContextT key_context = KeyContextT()) -> InsertKVResult + requires(!std::same_as && + std::invocable && + std::invocable); + + // Erase a key from the map. + template + auto Erase(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT()) + -> bool; + + // Clear all key/value pairs from the map but leave the underlying hashtable + // allocated and in place. + void Clear(); + + protected: + using ImplT::ImplT; +}; + +// A data structure mapping from key to value. +// +// This map also supports small size optimization (or "SSO"). The provided +// `SmallSize` type parameter indicates the size of an embedded buffer for +// storing maps small enough to fit. The default is zero, which always allocates +// a heap buffer on construction. When non-zero, must be a multiple of the +// `MaxGroupSize` which is currently 16. The library will check that the size is +// valid and provide an error at compile time if not. We don't automatically +// select the next multiple or otherwise fit the size to the constraints to make +// it clear in the code how much memory is used by the SSO buffer. +// +// This data structure optimizes heavily for small key types that are cheap to +// move and even copy. Using types with large keys or expensive to copy keys may +// create surprising performance bottlenecks. A `std::string` key should be fine +// with generally small strings, but if some or many strings are large heap +// allocations the performance of hashtable routines may be unacceptably bad and +// another data structure or key design is likely preferable. +// +// Note that this type should typically not appear on API boundaries; either +// `MapBase` or `MapView` should be used instead. +template +class Map : public RawHashtable::TableImpl< + MapBase, SmallSize> { + using BaseT = MapBase; + using ImplT = RawHashtable::TableImpl; + + public: + using KeyT = typename BaseT::KeyT; + using ValueT = typename BaseT::ValueT; + + Map() = default; + Map(const Map& arg) = default; + Map(Map&& arg) noexcept = default; + + // Reset the entire state of the hashtable to as it was when constructed, + // throwing away any intervening allocations. + void Reset(); +}; + +template +template +auto MapView::Contains( + LookupKeyT lookup_key, KeyContextT key_context) const -> bool { + return this->LookupEntry(lookup_key, key_context) != nullptr; +} + +template +template +auto MapView::Lookup( + LookupKeyT lookup_key, KeyContextT key_context) const -> LookupKVResult { + return LookupKVResult(this->LookupEntry(lookup_key, key_context)); +} + +template +template +auto MapView::operator[]( + LookupKeyT lookup_key) const + -> ValueT* requires(std::default_initializable) { + auto result = Lookup(lookup_key, KeyContextT()); + return result ? 
&result.value() : nullptr; + } + +template +template +void MapView::ForEach( + CallbackT callback) + requires(std::invocable) +{ + this->ForEachEntry( + [callback](EntryT& entry) { callback(entry.key(), entry.value()); }, + [](auto...) {}); +} + +template +template +[[clang::always_inline]] auto +MapBase::Insert( + LookupKeyT lookup_key, ValueT new_v, KeyContextT key_context) + -> InsertKVResult { + return Insert( + lookup_key, + [&new_v](LookupKeyT lookup_key, void* key_storage, void* value_storage) { + new (key_storage) KeyT(lookup_key); + new (value_storage) ValueT(std::move(new_v)); + }, + key_context); +} + +template +template +[[clang::always_inline]] auto +MapBase::Insert( + LookupKeyT lookup_key, ValueCallbackT value_cb, KeyContextT key_context) + -> InsertKVResult + requires( + !std::same_as && + std::convertible_to()()), ValueT>) +{ + return Insert( + lookup_key, + [&value_cb](LookupKeyT lookup_key, void* key_storage, + void* value_storage) { + new (key_storage) KeyT(lookup_key); + new (value_storage) ValueT(value_cb()); + }, + key_context); +} + +template +template +[[clang::always_inline]] auto +MapBase::Insert( + LookupKeyT lookup_key, InsertCallbackT insert_cb, KeyContextT key_context) + -> InsertKVResult + requires(!std::same_as && + std::invocable) +{ + auto [entry, inserted] = this->InsertImpl(lookup_key, key_context); + CARBON_DCHECK(entry) << "Should always result in a valid index."; + + if (LLVM_LIKELY(!inserted)) { + return InsertKVResult(false, *entry); + } + + insert_cb(lookup_key, static_cast(&entry->key_storage), + static_cast(&entry->value_storage)); + return InsertKVResult(true, *entry); +} + +template +template +[[clang::always_inline]] auto +MapBase::Update( + LookupKeyT lookup_key, ValueT new_v, KeyContextT key_context) + -> InsertKVResult { + return Update( + lookup_key, + [&new_v](LookupKeyT lookup_key, void* key_storage, void* value_storage) { + new (key_storage) KeyT(lookup_key); + new (value_storage) ValueT(std::move(new_v)); + }, + [&new_v](KeyT& /*key*/, ValueT& value) { + value.~ValueT(); + new (&value) ValueT(std::move(new_v)); + }, + key_context); +} + +template +template +[[clang::always_inline]] auto +MapBase::Update( + LookupKeyT lookup_key, ValueCallbackT value_cb, KeyContextT key_context) + -> InsertKVResult + requires( + !std::same_as && + std::convertible_to()()), ValueT>) +{ + return Update( + lookup_key, + [&value_cb](LookupKeyT lookup_key, void* key_storage, + void* value_storage) { + new (key_storage) KeyT(lookup_key); + new (value_storage) ValueT(value_cb()); + }, + [&value_cb](KeyT& /*key*/, ValueT& value) { + value.~ValueT(); + new (&value) ValueT(value_cb()); + }, + key_context); +} + +template +template +[[clang::always_inline]] auto +MapBase::Update( + LookupKeyT lookup_key, InsertCallbackT insert_cb, UpdateCallbackT update_cb, + KeyContextT key_context) -> InsertKVResult + requires(!std::same_as && + std::invocable && + std::invocable) +{ + auto [entry, inserted] = this->InsertImpl(lookup_key, key_context); + CARBON_DCHECK(entry) << "Should always result in a valid index."; + + if (LLVM_LIKELY(!inserted)) { + update_cb(entry->key(), entry->value()); + return InsertKVResult(false, *entry); + } + + insert_cb(lookup_key, static_cast(&entry->key_storage), + static_cast(&entry->value_storage)); + return InsertKVResult(true, *entry); +} + +template +template +auto MapBase::Erase( + LookupKeyT lookup_key, KeyContextT key_context) -> bool { + return this->EraseImpl(lookup_key, key_context); +} + +template +void MapBase::Clear() { + 
this->ClearImpl(); +} + +template +void Map::Reset() { + this->ResetImpl(); +} + +} // namespace Carbon + +#endif // CARBON_COMMON_MAP_H_ diff --git a/common/map_benchmark.cpp b/common/map_benchmark.cpp new file mode 100644 index 0000000000000..7c0ed92037339 --- /dev/null +++ b/common/map_benchmark.cpp @@ -0,0 +1,480 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include + +#include "absl/container/flat_hash_map.h" +#include "common/map.h" +#include "common/raw_hashtable_benchmark_helpers.h" +#include "llvm/ADT/DenseMap.h" + +namespace Carbon { +namespace { + +using RawHashtable::CarbonHashDI; +using RawHashtable::GetKeysAndHitKeys; +using RawHashtable::GetKeysAndMissKeys; +using RawHashtable::HitArgs; +using RawHashtable::SizeArgs; +using RawHashtable::ValueToBool; + +// Helpers to synthesize some value of one of the three types we use as value +// types. +template +auto MakeValue() -> T { + if constexpr (std::is_same_v) { + return "abc"; + } else if constexpr (std::is_pointer_v) { + static std::remove_pointer_t x; + return &x; + } else { + return 42; + } +} +template +auto MakeValue2() -> T { + if constexpr (std::is_same_v) { + return "qux"; + } else if constexpr (std::is_pointer_v) { + static std::remove_pointer_t y; + return &y; + } else { + return 7; + } +} + +template +struct IsCarbonMapImpl : std::false_type {}; +template +struct IsCarbonMapImpl> : std::true_type {}; + +template +static constexpr bool IsCarbonMap = IsCarbonMapImpl::value; + +// A wrapper around various map types that we specialize to implement a common +// API used in the benchmarks for various different map data structures that +// support different APIs. The primary template assumes a roughly +// `std::unordered_map` API design, and types with a different API design are +// supported through specializations. +template +struct MapWrapperImpl { + using KeyT = typename MapT::key_type; + using ValueT = typename MapT::mapped_type; + + MapT m; + + auto BenchContains(KeyT k) -> bool { return m.find(k) != m.end(); } + + auto BenchLookup(KeyT k) -> bool { + auto it = m.find(k); + if (it == m.end()) { + return false; + } + return ValueToBool(it->second); + } + + auto BenchInsert(KeyT k, ValueT v) -> bool { + auto result = m.insert({k, v}); + return result.second; + } + + auto BenchUpdate(KeyT k, ValueT v) -> bool { + auto result = m.insert({k, v}); + result.first->second = v; + return result.second; + } + + auto BenchErase(KeyT k) -> bool { return m.erase(k) != 0; } +}; + +// Explicit (partial) specialization for the Carbon map type that uses its +// different API design. +template +struct MapWrapperImpl> { + using MapT = Map; + using KeyT = KT; + using ValueT = VT; + + MapT m; + + auto BenchContains(KeyT k) -> bool { return m.Contains(k); } + + auto BenchLookup(KeyT k) -> bool { + auto result = m.Lookup(k); + if (!result) { + return false; + } + return ValueToBool(result.value()); + } + + auto BenchInsert(KeyT k, ValueT v) -> bool { + auto result = m.Insert(k, v); + return result.is_inserted(); + } + + auto BenchUpdate(KeyT k, ValueT v) -> bool { + auto result = m.Update(k, v); + return result.is_inserted(); + } + + auto BenchErase(KeyT k) -> bool { return m.Erase(k); } +}; + +// Provide a way to override the Carbon Map specific benchmark runs with another +// hashtable implementation. 
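The override knob described in this comment (continued just below) pastes a macro token into the `MapOverride` enum to select a wrapper specialization. A minimal self-contained sketch of that pattern, using hypothetical names rather than the benchmark's own, might look like:

```cpp
#include <cstdint>
#include <map>
#include <unordered_map>

// Hypothetical demo: `-DDEMO_MAP_OVERRIDE=StdMap` at compile time redirects
// the wrapper alias to a different container type.
enum class DemoOverride : uint8_t { None, StdMap };

#ifndef DEMO_MAP_OVERRIDE
#define DEMO_MAP_OVERRIDE None
#endif

template <typename MapT, DemoOverride /*Selector*/>
struct DemoWrapperOverride {
  using Type = MapT;
};
template <typename MapT>
struct DemoWrapperOverride<MapT, DemoOverride::StdMap> {
  using Type = std::map<int, int>;
};

// The macro token is substituted as an enumerator name, mirroring how
// `CARBON_MAP_BENCH_OVERRIDE` is combined with the `MapOverride` enumerators
// defined below.
using DemoWrapper =
    typename DemoWrapperOverride<std::unordered_map<int, int>,
                                 DemoOverride::DEMO_MAP_OVERRIDE>::Type;
```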
When building, you can use one of these enum names +// in a macro define such as `-DCARBON_MAP_BENCH_OVERRIDE=Name` in order to +// trigger a specific override for the `Map` type benchmarks. This is used to +// get before/after runs that compare the performance of Carbon's Map versus +// other implementations. +enum class MapOverride : uint8_t { + None, + Abseil, + LLVM, + LLVMAndCarbonHash, +}; +#ifndef CARBON_MAP_BENCH_OVERRIDE +#define CARBON_MAP_BENCH_OVERRIDE None +#endif + +template +struct MapWrapperOverride : MapWrapperImpl {}; + +template +struct MapWrapperOverride, MapOverride::Abseil> + : MapWrapperImpl> {}; + +template +struct MapWrapperOverride, MapOverride::LLVM> + : MapWrapperImpl> {}; + +template +struct MapWrapperOverride, + MapOverride::LLVMAndCarbonHash> + : MapWrapperImpl>> {}; + +template +using MapWrapper = + MapWrapperOverride; + +// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here. +#define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT, VT) \ + BENCHMARK(NAME>)->Apply(APPLY); \ + BENCHMARK(NAME>)->Apply(APPLY); \ + BENCHMARK(NAME>)->Apply(APPLY); \ + BENCHMARK(NAME>>)->Apply(APPLY) +// NOLINTEND(bugprone-macro-parentheses) + +#define MAP_BENCHMARK_ONE_OP(NAME, APPLY) \ + MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int, int); \ + MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int*, int*); \ + MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int, llvm::StringRef); \ + MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, llvm::StringRef, int) + +// Benchmark the minimal latency of checking if a key is contained within a map, +// when it *is* definitely in that map. Because this is only really measuring +// the *minimal* latency, it is more similar to a throughput benchmark. +// +// While this is structured to observe the latency of testing for presence of a +// key, it is important to understand the reality of what this measures. Because +// the boolean result testing for whether a key is in a map is fundamentally +// provided not by accessing some data, but by branching on data to a control +// flow path which sets the boolean to `true` or `false`, the result can be +// speculatively provided based on predicting the conditional branch without +// waiting for the results of the comparison to become available. And because +// this is a small operation and we arrange for all the candidate keys to be +// present, that branch *should* be predicted extremely well. The result is that +// this measures the un-speculated latency of testing for presence which should +// be small or zero. Which is why this is ultimately more similar to a +// throughput benchmark. +// +// Because of these measurement oddities, the specific measurements here may not +// be very interesting for predicting real-world performance in any way, but +// they are useful for comparing how 'cheap' the operation is across changes to +// the data structure or between similar data structures with similar +// properties. 
+template +static void BM_MapContainsHit(benchmark::State& state) { + using MapWrapperT = MapWrapper; + using KT = typename MapWrapperT::KeyT; + using VT = typename MapWrapperT::ValueT; + MapWrapperT m; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), state.range(1)); + for (auto k : keys) { + m.BenchInsert(k, MakeValue()); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size;) { + // We block optimizing `i` as that has proven both more effective at + // blocking the loop from being optimized away and avoiding disruption of + // the generated code that we're benchmarking. + benchmark::DoNotOptimize(i); + + bool result = m.BenchContains(lookup_keys[i]); + CARBON_DCHECK(result); + // We use the lookup success to step through keys, establishing a + // dependency between each lookup. This doesn't fully allow us to measure + // latency rather than throughput, as noted above. + i += static_cast(result); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_MapContainsHit, HitArgs); + +// Similar to `BM_MapContainsHit`, while this is structured as a latency +// benchmark, the critical path is expected to be well predicted and so it +// should turn into something closer to a throughput benchmark. +template +static void BM_MapContainsMiss(benchmark::State& state) { + using MapWrapperT = MapWrapper; + using KT = typename MapWrapperT::KeyT; + using VT = typename MapWrapperT::ValueT; + MapWrapperT m; + auto [keys, lookup_keys] = GetKeysAndMissKeys(state.range(0)); + for (auto k : keys) { + m.BenchInsert(k, MakeValue()); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size;) { + benchmark::DoNotOptimize(i); + + bool result = m.BenchContains(lookup_keys[i]); + CARBON_DCHECK(!result); + i += static_cast(!result); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_MapContainsMiss, SizeArgs); + +// This is a genuine latency benchmark. We lookup a key in the hashtable and use +// the value associated with that key in the critical path of loading the next +// iteration's key. We still ensure the keys are always present, and so we +// generally expect the data structure branches to be well predicted. But we +// vary the keys aggressively to avoid any prediction artifacts from repeatedly +// examining the same key. +// +// This latency can be very helpful for understanding a range of data structure +// behaviors: +// - Many users of hashtables are directly dependent on the latency of this +// operation, and this micro-benchmark will reflect the expected latency for +// them. +// - Showing how latency varies across different sizes of table and different +// fractions of the table being accessed (and thus needing space in the +// cache). +// +// However, it remains an ultimately synthetic and unrepresentative benchmark. +// It should primarily be used to understand the relative cost of these +// operations between versions of the data structure or between related data +// structures. +// +// We vary both the number of entries in the table and the number of distinct +// keys used when doing lookups. As the table becomes large, the latter dictates +// the fraction of the table that will be accessed and thus the working set size +// of the benchmark. 
Querying the same small number of keys in even a large +// table doesn't actually encounter any cache pressure, so only a few of these +// benchmarks will show any effects of the caching subsystem. +template +static void BM_MapLookupHit(benchmark::State& state) { + using MapWrapperT = MapWrapper; + using KT = typename MapWrapperT::KeyT; + using VT = typename MapWrapperT::ValueT; + MapWrapperT m; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), state.range(1)); + for (auto k : keys) { + m.BenchInsert(k, MakeValue()); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size;) { + benchmark::DoNotOptimize(i); + + bool result = m.BenchLookup(lookup_keys[i]); + CARBON_DCHECK(result); + i += static_cast(result); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_MapLookupHit, HitArgs); + +// This is an update throughput benchmark in practice. While whether the key was +// a hit is kept in the critical path, we only use keys that are hits and so +// expect that to be fully predicted and speculated. +// +// However, we expect this fairly closely matches how user code interacts with +// an update-style API. It will have some conditional testing (even if just an +// assert) on whether the key was a hit and otherwise continue executing. As a +// consequence the actual update is expected to not be in a meaningful critical +// path. +// +// This still provides a basic way to measure the cost of this operation, +// especially when comparing between implementations or across different hash +// tables. +template +static void BM_MapUpdateHit(benchmark::State& state) { + using MapWrapperT = MapWrapper; + using KT = typename MapWrapperT::KeyT; + using VT = typename MapWrapperT::ValueT; + MapWrapperT m; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), state.range(1)); + for (auto k : keys) { + m.BenchInsert(k, MakeValue()); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size; ++i) { + benchmark::DoNotOptimize(i); + + bool inserted = m.BenchUpdate(lookup_keys[i], MakeValue2()); + CARBON_DCHECK(!inserted); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_MapUpdateHit, HitArgs); + +// First erase and then insert the key. The code path will always be the same +// here and so we expect this to largely be a throughput benchmark because of +// branch prediction and speculative execution. +// +// We don't expect erase followed by insertion to be a common user code +// sequence, but we don't have a good way of benchmarking either erase or insert +// in isolation -- each would change the size of the table and thus the next +// iteration's benchmark. And if we try to correct the table size outside of the +// timed region, we end up trying to exclude too fine grained of a region from +// timers to get good measurement data. +// +// Our solution is to benchmark both erase and insertion back to back. We can +// then get a good profile of the code sequence of each, and at least measure +// the sum cost of these reliably. Careful profiling can help attribute that +// cost between erase and insert in order to understand which of the two +// operations is contributing most to any performance artifacts observed. 
+template +static void BM_MapEraseUpdateHit(benchmark::State& state) { + using MapWrapperT = MapWrapper; + using KT = typename MapWrapperT::KeyT; + using VT = typename MapWrapperT::ValueT; + MapWrapperT m; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), state.range(1)); + for (auto k : keys) { + m.BenchInsert(k, MakeValue()); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size; ++i) { + benchmark::DoNotOptimize(i); + + m.BenchErase(lookup_keys[i]); + benchmark::ClobberMemory(); + + bool inserted = m.BenchUpdate(lookup_keys[i], MakeValue2()); + CARBON_DCHECK(inserted); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_MapEraseUpdateHit, HitArgs); + +// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here. +#define MAP_BENCHMARK_OP_SEQ_SIZE(NAME, KT, VT) \ + BENCHMARK(NAME>)->Apply(SizeArgs); \ + BENCHMARK(NAME>)->Apply(SizeArgs); \ + BENCHMARK(NAME>)->Apply(APPLY); \ + BENCHMARK(NAME>>)->Apply(SizeArgs) +// NOLINTEND(bugprone-macro-parentheses) + +#define MAP_BENCHMARK_OP_SEQ(NAME) \ + MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int, int); \ + MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int*, int*); \ + MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int, llvm::StringRef); \ + MAP_BENCHMARK_OP_SEQ_SIZE(NAME, llvm::StringRef, int) + +// This is an interesting, somewhat specialized benchmark that measures the cost +// of inserting a sequence of key/value pairs into a table with no collisions up +// to some size and then inserting a colliding key and throwing away the table. +// +// This can give an idea of the cost of building up a map of a particular size, +// but without actually using it. Or of algorithms like cycle-detection which +// for some reason need an associative container. +// +// It also covers both the insert-into-an-empty-slot code path that isn't +// covered elsewhere, and the code path for growing a table to a larger size. +// +// Because this benchmark operates on whole maps, we also compute the number of +// probed keys for Carbon's set as that is both a general reflection of the +// efficacy of the underlying hash function, and a direct factor that drives the +// cost of these operations. +template +static void BM_MapInsertSeq(benchmark::State& state) { + using MapWrapperT = MapWrapper; + using KT = typename MapWrapperT::KeyT; + using VT = typename MapWrapperT::ValueT; + constexpr ssize_t LookupKeysSize = 1 << 8; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), LookupKeysSize); + + // Note that we don't force batches that use all the lookup keys because + // there's no difference in cache usage by covering all the different lookup + // keys. + ssize_t i = 0; + for (auto _ : state) { + benchmark::DoNotOptimize(i); + + MapWrapperT m; + for (auto k : keys) { + bool inserted = m.BenchInsert(k, MakeValue()); + CARBON_DCHECK(inserted) << "Must be a successful insert!"; + } + + // Now insert a final random repeated key. + bool inserted = m.BenchInsert(lookup_keys[i], MakeValue2()); + CARBON_DCHECK(!inserted) << "Must already be in the map!"; + + // Rotate through the shuffled keys. + i = (i + static_cast(!inserted)) & (LookupKeysSize - 1); + } + + // It can be easier in some cases to think of this as a key-throughput rate of + // insertion rather than the latency of inserting N keys, so construct the + // rate counter as well. 
+ state.counters["KeyRate"] = benchmark::Counter( + keys.size(), benchmark::Counter::kIsIterationInvariantRate); + + // Report some extra statistics about the Carbon type. + if constexpr (IsCarbonMap) { + // Re-build a map outside of the timing loop to look at the statistics + // rather than the timing. + MapT m; + for (auto k : keys) { + bool inserted = m.Insert(k, MakeValue()).is_inserted(); + CARBON_DCHECK(inserted) << "Must be a successful insert!"; + } + + // While this count is "iteration invariant" (it should be exactly the same + // for every iteration as the set of keys is the same), we don't use that + // because it will scale this by the number of iterations. We want to + // display the probe count of this benchmark *parameter*, not the probe + // count that resulted from the number of iterations. That means we use the + // normal counter API without flags. + state.counters["Probed"] = m.CountProbedKeys(); + + // Uncomment this call to print out statistics about the index-collisions + // among these keys for debugging: + // + // RawHashtable::DumpHashStatistics(keys); + } +} +MAP_BENCHMARK_ONE_OP(BM_MapInsertSeq, SizeArgs); + +} // namespace +} // namespace Carbon diff --git a/common/map_benchmark_test.sh b/common/map_benchmark_test.sh new file mode 100755 index 0000000000000..8a3d5d60c7635 --- /dev/null +++ b/common/map_benchmark_test.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# +# Part of the Carbon Language project, under the Apache License v2.0 with LLVM +# Exceptions. See /LICENSE for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/map_benchmark" + +exec "$BENCHMARK" \ + --benchmark_counters_tabular=true \ + --benchmark_min_time=1x \ + --benchmark_filter='^[^/]*/[1-9][0-9]{0,3}(/[0-9]+)?$' diff --git a/common/map_test.cpp b/common/map_test.cpp new file mode 100644 index 0000000000000..52dbb2ab4a194 --- /dev/null +++ b/common/map_test.cpp @@ -0,0 +1,627 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "common/map.h" + +#include +#include + +#include +#include +#include +#include + +#include "common/raw_hashtable_test_helpers.h" + +namespace Carbon::Testing { +namespace { + +using RawHashtable::FixedHashKeyContext; +using RawHashtable::IndexKeyContext; +using RawHashtable::TestData; +using RawHashtable::TestKeyContext; +using ::testing::Pair; +using ::testing::UnorderedElementsAreArray; + +template +void ExpectMapElementsAre(MapT&& m, MatcherRangeT element_matchers) { + // Now collect the elements into a container. + using KeyT = typename std::remove_reference::type::KeyT; + using ValueT = typename std::remove_reference::type::ValueT; + std::vector> map_entries; + m.ForEach([&map_entries](KeyT& k, ValueT& v) { + map_entries.push_back({k, v}); + }); + + // Use the GoogleMock unordered container matcher to validate and show errors + // on wrong elements. + EXPECT_THAT(map_entries, UnorderedElementsAreArray(element_matchers)); +} + +// Allow directly using an initializer list. +template +void ExpectMapElementsAre(MapT&& m, + std::initializer_list element_matchers) { + std::vector element_matchers_storage = element_matchers; + ExpectMapElementsAre(m, element_matchers_storage); +} + +template +auto MakeKeyValues(ValueCB value_cb, RangeT&& range, RangeTs&&... 
ranges) { + using KeyT = typename RangeT::value_type; + using ValueT = decltype(value_cb(std::declval())); + std::vector> elements; + auto add_range = [&](RangeT&& r) { + for (const auto&& e : r) { + elements.push_back({e, value_cb(e)}); + } + }; + add_range(std::forward(range)); + (add_range(std::forward(ranges)), ...); + + return elements; +} + +template +class MapTest : public ::testing::Test {}; + +using Types = ::testing::Types< + Map, Map, Map, + Map, Map, + Map, Map, + Map, Map, + Map>; +TYPED_TEST_SUITE(MapTest, Types); + +TYPED_TEST(MapTest, Basic) { + TypeParam m; + + EXPECT_FALSE(m.Contains(42)); + EXPECT_EQ(nullptr, m[42]); + EXPECT_TRUE(m.Insert(1, 100).is_inserted()); + ASSERT_TRUE(m.Contains(1)); + auto result = m.Lookup(1); + EXPECT_TRUE(result); + EXPECT_EQ(1, result.key()); + EXPECT_EQ(100, result.value()); + EXPECT_EQ(100, *m[1]); + // Reinsertion doesn't change the value. + auto i_result = m.Insert(1, 101); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_EQ(100, i_result.value()); + EXPECT_EQ(100, *m[1]); + // Update does change the value. + i_result = m.Update(1, 101); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_EQ(101, i_result.value()); + EXPECT_EQ(101, *m[1]); + + // Verify all the elements. + ExpectMapElementsAre(m, {Pair(1, 101)}); + + // Fill up a bunch to ensure we trigger growth a few times. + for (int i : llvm::seq(2, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100).is_inserted()); + + // Immediately do a basic check of all elements to pin down when an + // insertion corrupts the rest of the table. + ExpectMapElementsAre( + m, + MakeKeyValues([](int k) { return k * 100 + static_cast(k == 1); }, + llvm::seq_inclusive(1, i))); + } + for (int i : llvm::seq(1, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100 + static_cast(i == 1), *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100 + 1, *m[i]); + } + EXPECT_FALSE(m.Contains(513)); + + // Verify all the elements. + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 512))); +} + +TYPED_TEST(MapTest, FactoryAPI) { + TypeParam m; + EXPECT_TRUE(m.Insert(1, [] { return 100; }).is_inserted()); + ASSERT_TRUE(m.Contains(1)); + EXPECT_EQ(100, *m[1]); + // Reinsertion doesn't invoke the callback. + EXPECT_FALSE(m.Insert(1, []() -> int { + llvm_unreachable("Should never be called!"); + }).is_inserted()); + // Update does invoke the callback. + auto i_result = m.Update(1, [] { return 101; }); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_EQ(101, i_result.value()); + EXPECT_EQ(101, *m[1]); +} + +TYPED_TEST(MapTest, Copy) { + using MapT = TypeParam; + + MapT m; + // Make sure we exceed the small size for some of the map types, but not all + // of them, so we cover all the combinations of copying between small and + // large. + for (int i : llvm::seq(1, 24)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Insert(i, i * 100).is_inserted()); + } + + MapT other_m1 = m; + ExpectMapElementsAre( + other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24))); + + // Add some more elements to the original. + for (int i : llvm::seq(24, 32)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Insert(i, i * 100).is_inserted()); + } + + // The first copy doesn't change. 
+ ExpectMapElementsAre( + other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24))); + + // A new copy does. + MapT other_m2 = m; + ExpectMapElementsAre( + other_m2, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 32))); +} + +TYPED_TEST(MapTest, Move) { + using MapT = TypeParam; + + MapT m; + // Make sure we exceed the small size for some of the map types, but not all + // of them, so we cover all the combinations of moving between small and + // large. + for (int i : llvm::seq(1, 24)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Insert(i, i * 100).is_inserted()); + } + + MapT other_m1 = std::move(m); + ExpectMapElementsAre( + other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24))); + + // Add some more elements. + for (int i : llvm::seq(24, 32)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(other_m1.Insert(i, i * 100).is_inserted()); + } + ExpectMapElementsAre( + other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 32))); +} + +TYPED_TEST(MapTest, Conversions) { + using MapT = TypeParam; + using KeyT = MapT::KeyT; + using ValueT = MapT::ValueT; + using KeyContextT = MapT::KeyContextT; + + MapT m; + + ASSERT_TRUE(m.Insert(1, 101).is_inserted()); + ASSERT_TRUE(m.Insert(2, 102).is_inserted()); + ASSERT_TRUE(m.Insert(3, 103).is_inserted()); + ASSERT_TRUE(m.Insert(4, 104).is_inserted()); + + MapView mv = m; + MapView cmv = m; + MapView cmv2 = m; + MapView cmv3 = m; + EXPECT_TRUE(mv.Contains(1)); + EXPECT_EQ(101, *mv[1]); + EXPECT_TRUE(cmv.Contains(2)); + EXPECT_EQ(102, *cmv[2]); + EXPECT_TRUE(cmv2.Contains(3)); + EXPECT_EQ(103, *cmv2[3]); + EXPECT_TRUE(cmv3.Contains(4)); + EXPECT_EQ(104, *cmv3[4]); +} + +// This test is largely exercising the underlying `RawHashtable` implementation +// with complex growth, erasure, and re-growth. +TYPED_TEST(MapTest, ComplexOpSequence) { + // Use a small size as well to cover more growth scenarios. + TypeParam m; + + EXPECT_FALSE(m.Contains(42)); + EXPECT_EQ(nullptr, m[42]); + EXPECT_TRUE(m.Insert(1, 100).is_inserted()); + ASSERT_TRUE(m.Contains(1)); + auto result = m.Lookup(1); + EXPECT_TRUE(result); + EXPECT_EQ(1, result.key()); + EXPECT_EQ(100, result.value()); + EXPECT_EQ(100, *m[1]); + // Reinsertion doesn't change the value. + auto i_result = m.Insert(1, 101); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_EQ(100, i_result.value()); + EXPECT_EQ(100, *m[1]); + // Update does change the value. + i_result = m.Update(1, 101); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_EQ(101, i_result.value()); + EXPECT_EQ(101, *m[1]); + + // Verify all the elements. + ExpectMapElementsAre(m, {Pair(1, 101)}); + + // Fill up the small buffer but don't overflow it. + for (int i : llvm::seq(2, 5)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100).is_inserted()); + } + for (int i : llvm::seq(1, 5)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Contains(i)); + EXPECT_EQ(i * 100 + static_cast(i == 1), *m[i]); + EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100 + static_cast(i == 1), *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100 + 1, *m[i]); + } + EXPECT_FALSE(m.Contains(5)); + + // Verify all the elements. + ExpectMapElementsAre( + m, {Pair(1, 101), Pair(2, 201), Pair(3, 301), Pair(4, 401)}); + + // Erase some entries from the small buffer. 
+ EXPECT_FALSE(m.Erase(42)); + EXPECT_TRUE(m.Erase(2)); + EXPECT_EQ(101, *m[1]); + EXPECT_EQ(nullptr, m[2]); + EXPECT_EQ(301, *m[3]); + EXPECT_EQ(401, *m[4]); + EXPECT_TRUE(m.Erase(1)); + EXPECT_EQ(nullptr, m[1]); + EXPECT_EQ(nullptr, m[2]); + EXPECT_EQ(301, *m[3]); + EXPECT_EQ(401, *m[4]); + EXPECT_TRUE(m.Erase(4)); + EXPECT_EQ(nullptr, m[1]); + EXPECT_EQ(nullptr, m[2]); + EXPECT_EQ(301, *m[3]); + EXPECT_EQ(nullptr, m[4]); + // Fill them back in, but with a different order and going back to the + // original value. + EXPECT_TRUE(m.Insert(1, 100).is_inserted()); + EXPECT_TRUE(m.Insert(2, 200).is_inserted()); + EXPECT_TRUE(m.Insert(4, 400).is_inserted()); + EXPECT_EQ(100, *m[1]); + EXPECT_EQ(200, *m[2]); + EXPECT_EQ(301, *m[3]); + EXPECT_EQ(400, *m[4]); + // Then update their values to match. + EXPECT_FALSE(m.Update(1, 101).is_inserted()); + EXPECT_FALSE(m.Update(2, 201).is_inserted()); + EXPECT_FALSE(m.Update(4, 401).is_inserted()); + + // Now fill up the first metadata group. + for (int i : llvm::seq(5, 14)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100).is_inserted()); + } + for (int i : llvm::seq(1, 14)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Contains(i)); + EXPECT_EQ(i * 100 + static_cast(i < 5), *m[i]); + EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted()); + EXPECT_EQ(i * 100 + static_cast(i < 5), *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted()); + EXPECT_EQ(i * 100 + 2, *m[i]); + } + EXPECT_FALSE(m.Contains(42)); + + // Verify all the elements by walking the entire map. + ExpectMapElementsAre( + m, {Pair(1, 102), Pair(2, 202), Pair(3, 302), Pair(4, 402), Pair(5, 502), + Pair(6, 602), Pair(7, 702), Pair(8, 802), Pair(9, 902), + Pair(10, 1002), Pair(11, 1102), Pair(12, 1202), Pair(13, 1302)}); + + // Now fill up several more groups. + for (int i : llvm::seq(14, 100)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100).is_inserted()); + } + for (int i : llvm::seq(1, 100)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Contains(i)); + EXPECT_EQ(i * 100 + 2 * static_cast(i < 14), *m[i]); + EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100 + 2 * static_cast(i < 14), *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 3).is_inserted()); + EXPECT_EQ(i * 100 + 3, *m[i]); + } + EXPECT_FALSE(m.Contains(420)); + + // Check walking the entire container. + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 3; }, llvm::seq(1, 100))); + + // Clear back to empty. + m.Clear(); + EXPECT_FALSE(m.Contains(42)); + EXPECT_EQ(nullptr, m[42]); + + // Refill but with both overlapping and different values. 
+ for (int i : llvm::seq(50, 150)) { + EXPECT_TRUE(m.Insert(i, i * 100).is_inserted()); + } + for (int i : llvm::seq(50, 150)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Contains(i)); + EXPECT_EQ(i * 100, *m[i]); + EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100, *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100 + 1, *m[i]); + } + EXPECT_FALSE(m.Contains(42)); + EXPECT_FALSE(m.Contains(420)); + + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(50, 150))); + + EXPECT_FALSE(m.Erase(42)); + EXPECT_TRUE(m.Contains(73)); + EXPECT_TRUE(m.Erase(73)); + EXPECT_FALSE(m.Contains(73)); + for (int i : llvm::seq(102, 136)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Contains(i)); + EXPECT_TRUE(m.Erase(i)); + EXPECT_FALSE(m.Contains(i)); + } + for (int i : llvm::seq(50, 150)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + if (i == 73 || (i >= 102 && i < 136)) { + continue; + } + ASSERT_TRUE(m.Contains(i)); + EXPECT_EQ(i * 100 + 1, *m[i]); + EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted()); + EXPECT_EQ(i * 100 + 1, *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted()); + EXPECT_EQ(i * 100 + 2, *m[i]); + } + EXPECT_TRUE(m.Insert(73, 73 * 100 + 3).is_inserted()); + EXPECT_EQ(73 * 100 + 3, *m[73]); + + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 2 + (k == 73); }, + llvm::seq(50, 102), llvm::seq(136, 150))); + + // Reset back to empty and small. + m.Reset(); + EXPECT_FALSE(m.Contains(42)); + EXPECT_EQ(nullptr, m[42]); + + // Refill but with both overlapping and different values, now triggering + // growth too. Also, use update instead of insert. + for (int i : llvm::seq(75, 175)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Update(i, i * 100).is_inserted()); + } + for (int i : llvm::seq(75, 175)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(m.Contains(i)); + EXPECT_EQ(i * 100, *m[i]); + EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100, *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted()); + EXPECT_EQ(i * 100 + 1, *m[i]); + } + EXPECT_FALSE(m.Contains(42)); + EXPECT_FALSE(m.Contains(420)); + + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(75, 175))); + + EXPECT_FALSE(m.Erase(42)); + EXPECT_TRUE(m.Contains(93)); + EXPECT_TRUE(m.Erase(93)); + EXPECT_FALSE(m.Contains(93)); + for (int i : llvm::seq(102, 136)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Contains(i)); + EXPECT_TRUE(m.Erase(i)); + EXPECT_FALSE(m.Contains(i)); + } + for (int i : llvm::seq(75, 175)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + if (i == 93 || (i >= 102 && i < 136)) { + continue; + } + ASSERT_TRUE(m.Contains(i)); + EXPECT_EQ(i * 100 + 1, *m[i]); + EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted()); + EXPECT_EQ(i * 100 + 1, *m[i]); + EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted()); + EXPECT_EQ(i * 100 + 2, *m[i]); + } + EXPECT_TRUE(m.Insert(93, 93 * 100 + 3).is_inserted()); + EXPECT_EQ(93 * 100 + 3, *m[93]); + + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 2 + (k == 93); }, + llvm::seq(75, 102), llvm::seq(136, 175))); +} + +template +class MapCollisionTest : public ::testing::Test {}; + +using CollisionTypes = ::testing::Types< + Map>, + Map>, + Map>, + Map(0)>>>; +TYPED_TEST_SUITE(MapCollisionTest, CollisionTypes); + +TYPED_TEST(MapCollisionTest, Basic) 
{ + TypeParam m; + + // Fill the map through a couple of growth steps, verifying at each step. Note + // that because this is a collision test, we synthesize actively harmful + // hashes in terms of collisions and so this test is essentially quadratic. We + // need to keep it relatively small. + for (int i : llvm::seq(1, 256)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100).is_inserted()); + + // Immediately do a basic check of all elements to pin down when an + // insertion corrupts the rest of the table. + ExpectMapElementsAre(m, MakeKeyValues([](int k) { return k * 100; }, + llvm::seq_inclusive(1, i))); + } + EXPECT_FALSE(m.Contains(257)); + + // Erase and re-fill from the back. + for (int i : llvm::seq(192, 256)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Erase(i)); + } + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 192))); + for (int i : llvm::seq(192, 256)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted()); + } + ExpectMapElementsAre(m, + MakeKeyValues([](int k) { return k * 100 + (k >= 192); }, + llvm::seq(1, 256))); + + // Erase and re-fill from the front. + for (int i : llvm::seq(1, 64)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Erase(i)); + } + ExpectMapElementsAre(m, + MakeKeyValues([](int k) { return k * 100 + (k >= 192); }, + llvm::seq(64, 256))); + for (int i : llvm::seq(1, 64)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted()); + } + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + (k < 64) + (k >= 192); }, + llvm::seq(1, 256))); + + // Erase and re-fill from the middle. + for (int i : llvm::seq(64, 192)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Erase(i)); + } + ExpectMapElementsAre(m, MakeKeyValues([](int k) { return k * 100 + 1; }, + llvm::seq(1, 64), llvm::seq(192, 256))); + for (int i : llvm::seq(64, 192)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted()); + } + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 256))); + + // Erase and re-fill from both the back and front. + for (auto s : {llvm::seq(192, 256), llvm::seq(1, 64)}) { + for (int i : s) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Erase(i)); + } + } + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(64, 192))); + for (auto s : {llvm::seq(192, 256), llvm::seq(1, 64)}) { + for (int i : s) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100 + 2).is_inserted()); + } + } + ExpectMapElementsAre( + m, + MakeKeyValues([](int k) { return k * 100 + 1 + (k < 64) + (k >= 192); }, + llvm::seq(1, 256))); + + // And update the middle elements in place. 
+ for (int i : llvm::seq(64, 192)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted()); + } + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 2; }, llvm::seq(1, 256))); +} + +TEST(MapContextTest, Basic) { + llvm::SmallVector keys; + for (int i : llvm::seq(0, 513)) { + keys.push_back(i * 100000); + } + IndexKeyContext key_context(keys); + Map> m; + + EXPECT_FALSE(m.Contains(42, key_context)); + EXPECT_TRUE(m.Insert(1, 100, key_context).is_inserted()); + ASSERT_TRUE(m.Contains(1, key_context)); + auto result = m.Lookup(TestData(100000), key_context); + EXPECT_TRUE(result); + EXPECT_EQ(1, result.key()); + EXPECT_EQ(100, result.value()); + // Reinsertion doesn't change the value. Also, double check a temporary + // context. + auto i_result = m.Insert(1, 101, IndexKeyContext(keys)); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_EQ(100, i_result.value()); + // Update does change the value. + i_result = m.Update(1, 101, key_context); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_EQ(101, i_result.value()); + + // Verify all the elements. + ExpectMapElementsAre(m, {Pair(1, 101)}); + + // Fill up a bunch to ensure we trigger growth a few times. + for (int i : llvm::seq(2, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(m.Insert(i, i * 100, key_context).is_inserted()); + + // Immediately do a basic check of all elements to pin down when an + // insertion corrupts the rest of the table. + for (int j : llvm::seq(1, i)) { + SCOPED_TRACE(llvm::formatv("Assert key: {0}", j).str()); + ASSERT_EQ(j * 100 + static_cast(j == 1), + m.Lookup(j, key_context).value()); + ASSERT_EQ(j * 100 + static_cast(j == 1), + m.Lookup(TestData(j * 100000), key_context).value()); + } + } + for (int i : llvm::seq(1, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_FALSE(m.Insert(i, i * 100 + 1, key_context).is_inserted()); + EXPECT_EQ(i * 100 + static_cast(i == 1), + m.Lookup(i, key_context).value()); + EXPECT_FALSE(m.Update(i, i * 100 + 1, key_context).is_inserted()); + EXPECT_EQ(i * 100 + 1, m.Lookup(i, key_context).value()); + } + EXPECT_FALSE(m.Contains(0, key_context)); + EXPECT_FALSE(m.Contains(512, key_context)); + + // Verify all the elements. + ExpectMapElementsAre( + m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 512))); +} + +} // namespace +} // namespace Carbon::Testing diff --git a/common/ostream.h b/common/ostream.h index d06c42da4d475..be984707af2d7 100644 --- a/common/ostream.h +++ b/common/ostream.h @@ -54,6 +54,7 @@ class Printable { // Returns the result of printing the value. template + requires std::derived_from> inline auto PrintToString(const T& val) -> std::string { std::string str; llvm::raw_string_ostream stream(str); diff --git a/common/raw_hashtable.cpp b/common/raw_hashtable.cpp new file mode 100644 index 0000000000000..d204ac5df30c3 --- /dev/null +++ b/common/raw_hashtable.cpp @@ -0,0 +1,11 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "common/raw_hashtable.h" + +namespace Carbon::RawHashtable { + +volatile std::byte global_addr_seed{1}; + +} // namespace Carbon::RawHashtable diff --git a/common/raw_hashtable.h b/common/raw_hashtable.h new file mode 100644 index 0000000000000..eaff986edf084 --- /dev/null +++ b/common/raw_hashtable.h @@ -0,0 +1,1336 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef CARBON_COMMON_RAW_HASHTABLE_H_ +#define CARBON_COMMON_RAW_HASHTABLE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "common/check.h" +#include "common/hashing.h" +#include "common/hashtable_key_context.h" +#include "common/raw_hashtable_metadata_group.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/MathExtras.h" + +// A namespace collecting a set of low-level utilities for building hashtable +// data structures. These should only be used as implementation details of +// higher-level data-structure APIs. +// +// The utilities here use the `hashtable_key_context.h` provided `KeyContext` to +// support the necessary hashtable operations on keys: hashing and comparison. +// This also serves as the customization point for hashtables built on this +// infrastructure for those operations. See that header file for details. +// +// These utilities support hashtables following a *specific* API design pattern, +// and using Small-Size Optimization, or "SSO", when desired. We expect there to +// be three layers to any hashtable design: +// +// - A *view* type: a read-only view of the hashtable contents. This type should +// be a value type and is expected to be passed by-value in APIs. However, it +// will have `const`-reference semantics, much like a `std::string_view`. Note +// that the *entries* will continue to be mutable, it is only the *table* that +// is read-only. +// +// - A *base* type: a base class type of the actual hashtable, which allows +// almost all mutable operations but erases any specific SSO buffer size. +// Because this is a base of the actual hash table, it is designed to be +// passed as a non-`const` reference or pointer. +// +// - A *table* type: the actual hashtable which derives from the base type and +// adds any desired SSO storage buffer. Beyond the physical storage, it also +// allows resetting the table to its initial state & allocated size, as well +// as copying and moving the table. +// +// For complete examples of the API design, see `set.h` for a hashtable-based +// set data structure, and `map.h` for a hashtable-based map data structure. +// +// The hashtable design implemented here has several key invariants and design +// elements that are essential to all three of the types above and the +// functionality they provide. +// +// - The underlying hashtable uses [open addressing], a power-of-two table size, +// and quadratic probing rather than closed addressing and chaining. +// +// [open addressing]: https://en.wikipedia.org/wiki/Open_addressing +// +// - Each _slot_ in the table corresponds to a key, a value, and one byte of +// metadata. Each _entry_ is a key and value. The key and value for an entry +// are stored together. +// +// - The allocated storage is organized into an array of metadata bytes followed +// by an array of entry storage. 
+//
+// - The metadata byte corresponding to each entry marks whether that entry is
+//   empty, deleted, or present. When present, it also stores a 7-bit tag taken
+//   from the hash of the entry's key.
+//
+// - The storage for an entry is an internal type that should not be exposed to
+//   users; only the underlying keys and values should be exposed.
+//
+// - Hash addressing and probing occur over *groups* of slots rather than
+//   individual entries. When inserting a new entry, it can be added to the
+//   group it hashes to as long as that group is not full, and it can even
+//   replace a slot whose tombstone marks a previously deleted entry. Only when
+//   the group is full will insertion look at the next group in the probe
+//   sequence. As a result, a group may contain entries whose probe sequence
+//   starts at a different group. Also, when performing a lookup, every group
+//   in the probe sequence must be inspected for the lookup key until the key
+//   is found or a group with an empty slot is reached.
+//
+// - Groups are scanned rapidly using the one-byte metadata for each entry in
+//   the group and CPU instructions that allow comparing all of the metadata
+//   for a group in parallel. For more details on the metadata group encoding
+//   and scanning, see `raw_hashtable_metadata_group.h`.
+//
+// - `GroupSize` is a platform-specific, relatively small power of two sized to
+//   fit in a hardware register. However, `MaxGroupSize` is provided as a
+//   portable maximum that is also a power of two. The table storage, whether
+//   provided by an SSO buffer or allocated, is required to be a multiple of
+//   `MaxGroupSize` to keep the requirement portable but sufficient for all
+//   platforms.
+//
+// - There is *always* an allocated table of some multiple of `MaxGroupSize`.
+//   This allows accesses to be branchless. When heap allocating, we
+//   proactively allocate at least a minimum-sized table. When there is a
+//   small-size optimization (SSO) buffer, that provides the initial
+//   allocation.
+//
+// - The table performs a minimal amount of bookkeeping that limits the APIs it
+//   can support:
+//   - `alloc_size` is the size of the table *allocated* (not *used*), and is
+//     always a power of two at least as large as `MinAllocatedSize`.
+//   - `storage` is a pointer to the storage for the `alloc_size` slots of the
+//     table, and is never null.
+//   - `small_alloc_size` is the maximum `alloc_size` where the table is stored
+//     in the object itself instead of separately on the heap. In this case,
+//     `storage` points to `small_storage_`.
+//   - `growth_budget` is the number of entries that may be added before the
+//     table allocation is doubled. It is always
+//     `GrowthThresholdForAllocSize(alloc_size)` minus the number of non-empty
+//     (filled or deleted) slots. If it ever falls to 0, the table is grown to
+//     keep it greater than 0.
+//   There is also a "moved-from" state, in which `alloc_size` is 0 and
+//   `storage` is null, where the table may only be reinitialized or destroyed.
+//   Since the table doesn't track the exact number of filled entries, it
+//   doesn't support a container-style `size` API.
+//
+// - There is no direct iterator support because of the complexity of embedding
+//   the group-based metadata scanning into an iterator model. Instead, there
+//   is just a for-each method that is passed a lambda to observe all entries.
+//   The order in which entries are observed is not guaranteed.
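// Editor's illustrative sketch -- not part of this change. It shows roughly
// how the three-layer design above surfaces to users of the `Map` type from
// `map.h`, based on the API exercised in `map_test.cpp`. The exact template
// parameters and their defaults are assumptions here, not guarantees.

#include "common/map.h"

namespace map_usage_sketch {

inline auto Example() -> int {
  // The *table* type owns the storage (possibly an SSO buffer) and supports
  // all mutating operations.
  Carbon::Map<int, int> m;
  m.Insert(1, 100);

  // The *view* type is a cheap, by-value, read-only view of the table; the
  // table implicitly decays to it.
  Carbon::MapView<int, int> view = m;
  int sum = 0;
  if (const int* v = view[1]) {
    sum += *v;
  }

  // Mutations such as `Erase` stay on the table (or *base*) type.
  m.Erase(1);
  return sum;
}

}  // namespace map_usage_sketch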
+namespace Carbon::RawHashtable { + +// If allocating storage, allocate a minimum of one cacheline of group metadata +// or a minimum of one group, whichever is larger. +constexpr ssize_t MinAllocatedSize = std::max(64, MaxGroupSize); + +// An entry in the hashtable storage of a `KeyT` and `ValueT` object. +// +// Allows manual construction, destruction, and access to these values so we can +// create arrays af the entries prior to populating them with actual keys and +// values. +template +struct StorageEntry { + static constexpr bool IsTriviallyDestructible = + std::is_trivially_destructible_v && + std::is_trivially_destructible_v; + + static constexpr bool IsTriviallyRelocatable = + IsTriviallyDestructible && std::is_trivially_move_constructible_v && + std::is_trivially_move_constructible_v; + + auto key() const -> const KeyT& { + // Ensure we don't need more alignment than available. Inside a method body + // to apply to the complete type. + static_assert( + alignof(StorageEntry) <= MinAllocatedSize, + "The minimum allocated size turns into the alignment of our array of " + "storage entries as they follow the metadata byte array."); + + return *std::launder(reinterpret_cast(&key_storage)); + } + auto key() -> KeyT& { + return const_cast(const_cast(this)->key()); + } + + auto value() const -> const ValueT& { + return *std::launder(reinterpret_cast(&value_storage)); + } + auto value() -> ValueT& { + return const_cast(const_cast(this)->value()); + } + + // We handle destruction and move manually as we only want to expose distinct + // `KeyT` and `ValueT` subobjects to user code that may need to do in-place + // construction. As a consequence, this struct only provides the storage and + // we have to manually manage the construction, move, and destruction of the + // objects. + auto Destroy() -> void { + static_assert(!IsTriviallyDestructible, + "Should never instantiate when trivial!"); + key().~KeyT(); + value().~ValueT(); + } + + auto CopyFrom(const StorageEntry& entry) -> void { + if constexpr (IsTriviallyRelocatable) { + memcpy(this, &entry, sizeof(StorageEntry)); + } else { + new (&key_storage) KeyT(entry.key()); + new (&value_storage) ValueT(entry.value()); + } + } + + // Move from an expiring entry and destroy that entry's key and value. + // Optimizes to directly use `memcpy` when correct. + auto MoveFrom(StorageEntry&& entry) -> void { + if constexpr (IsTriviallyRelocatable) { + memcpy(this, &entry, sizeof(StorageEntry)); + } else { + new (&key_storage) KeyT(std::move(entry.key())); + entry.key().~KeyT(); + new (&value_storage) ValueT(std::move(entry.value())); + entry.value().~ValueT(); + } + } + + alignas(KeyT) std::byte key_storage[sizeof(KeyT)]; + alignas(ValueT) std::byte value_storage[sizeof(ValueT)]; +}; + +// A specialization of the storage entry for sets without a distinct value type. +// Somewhat duplicative with the key-value version, but C++ specialization makes +// doing better difficult. +template +struct StorageEntry { + static constexpr bool IsTriviallyDestructible = + std::is_trivially_destructible_v; + + static constexpr bool IsTriviallyRelocatable = + IsTriviallyDestructible && std::is_trivially_move_constructible_v; + + auto key() const -> const KeyT& { + // Ensure we don't need more alignment than available. 
+ static_assert( + alignof(StorageEntry) <= MinAllocatedSize, + "The minimum allocated size turns into the alignment of our array of " + "storage entries as they follow the metadata byte array."); + + return *std::launder(reinterpret_cast(&key_storage)); + } + auto key() -> KeyT& { + return const_cast(const_cast(this)->key()); + } + + auto Destroy() -> void { + static_assert(!IsTriviallyDestructible, + "Should never instantiate when trivial!"); + key().~KeyT(); + } + + auto CopyFrom(const StorageEntry& entry) -> void { + if constexpr (IsTriviallyRelocatable) { + memcpy(this, &entry, sizeof(StorageEntry)); + } else { + new (&key_storage) KeyT(entry.key()); + } + } + + auto MoveFrom(StorageEntry&& entry) -> void { + if constexpr (IsTriviallyRelocatable) { + memcpy(this, &entry, sizeof(StorageEntry)); + } else { + new (&key_storage) KeyT(std::move(entry.key())); + entry.key().~KeyT(); + } + } + + alignas(KeyT) std::byte key_storage[sizeof(KeyT)]; +}; + +// A placeholder empty type used to model pointers to the allocated buffer of +// storage. +// +// The allocated storage doesn't have a meaningful static layout -- it consists +// of an array of metadata groups followed by an array of storage entries. +// However, we want to be able to mark pointers to this and so use pointers to +// this placeholder type as that signifier. +// +// This is a complete, empty type so that it can be used as a base class of a +// specific concrete storage type for compile-time sized storage. +struct Storage {}; + +// Forward declaration to support friending, see the definition below. +template +class BaseImpl; + +// Implementation helper for defining a read-only view type for a hashtable. +// +// A specific user-facing hashtable view type should derive privately from this +// type, and forward the implementation of its interface to functions in this +// type. +// +// The methods available to user-facing hashtable types are `protected`, and +// where they are expected to directly map to a public API, named with an +// `Impl`. The suffix naming ensures types don't `using` in these low-level APIs +// but declare their own and implement them by forwarding to these APIs. We +// don't want users to have to read these implementation details to understand +// their container's API, so none of these methods should be `using`-ed into the +// user facing types. +// +// Some of the types are just convenience aliases and aren't important to +// surface as part of the user-facing type API for readers and so those are +// reasonable to add via a `using`. +// +// Some methods are used by other parts of the raw hashtable implementation. +// Those are kept `private` and where necessary the other components of the raw +// hashtable implementation are friended to give access to them. +template +class ViewImpl { + protected: + using KeyT = InputKeyT; + using ValueT = InputValueT; + using KeyContextT = InputKeyContextT; + using EntryT = StorageEntry; + + friend class BaseImpl; + + // Make more-`const` types friends to enable conversions that add `const`. + friend class ViewImpl; + friend class ViewImpl; + friend class ViewImpl; + + ViewImpl() = default; + + // Support adding `const` to either key or value type of some other view. 
+ template + // NOLINTNEXTLINE(google-explicit-constructor) + ViewImpl(ViewImpl other_view) + requires(std::same_as || + std::same_as) && + (std::same_as || + std::same_as) + : alloc_size_(other_view.alloc_size_), storage_(other_view.storage_) {} + + // Looks up an entry in the hashtable and returns its address or null if not + // present. + template + auto LookupEntry(LookupKeyT lookup_key, KeyContextT key_context) const + -> EntryT*; + + // Calls `entry_callback` for each entry in the hashtable. All the entries + // within a specific group are visited first, and then `group_callback` is + // called on the group itself. The `group_callback` is typically only used by + // the internals of the hashtable. + template + auto ForEachEntry(EntryCallbackT entry_callback, + GroupCallbackT group_callback) const -> void; + + // Counts the number of keys in the hashtable that required probing beyond the + // initial group. + auto CountProbedKeys(KeyContextT key_context) const -> ssize_t; + + private: + ViewImpl(ssize_t alloc_size, Storage* storage) + : alloc_size_(alloc_size), storage_(storage) {} + + // Computes the offset from the metadata array to the entries array for a + // given size. This is trivial, but we use this routine to enforce invariants + // on the sizes. + static constexpr auto EntriesOffset(ssize_t alloc_size) -> ssize_t { + CARBON_DCHECK(llvm::isPowerOf2_64(alloc_size)) + << "Size must be a power of two for a hashed buffer!"; + // The size is always a power of two. We prevent any too-small sizes so it + // being a power of two provides the needed alignment. As a result, the + // offset is exactly the size. We validate this here to catch alignment bugs + // early. + CARBON_DCHECK(static_cast(alloc_size) == + llvm::alignTo(alloc_size)); + return alloc_size; + } + + auto metadata() const -> uint8_t* { + return reinterpret_cast(storage_); + } + auto entries() const -> EntryT* { + return reinterpret_cast(reinterpret_cast(storage_) + + EntriesOffset(alloc_size_)); + } + + ssize_t alloc_size_; + Storage* storage_; +}; + +// Implementation helper for defining a read-write base type for a hashtable +// that type-erases any SSO buffer. +// +// A specific user-facing hashtable base type should derive using *`protected`* +// inheritance from this type, and forward the implementation of its interface +// to functions in this type. +// +// Other than the use of `protected` inheritance, the patterns for this type, +// and how to build user-facing hashtable base types from it, mirror those of +// `ViewImpl`. See its documentation for more details. +template +class BaseImpl { + protected: + using KeyT = InputKeyT; + using ValueT = InputValueT; + using KeyContextT = InputKeyContextT; + using ViewImplT = ViewImpl; + using EntryT = typename ViewImplT::EntryT; + + BaseImpl(int small_alloc_size, Storage* small_storage) + : small_alloc_size_(small_alloc_size) { + CARBON_CHECK(small_alloc_size >= 0); + Construct(small_storage); + } + // Only used for copying and moving, and leaves storage uninitialized. + BaseImpl(ssize_t alloc_size, int growth_budget, int small_alloc_size) + : view_impl_(alloc_size, nullptr), + growth_budget_(growth_budget), + small_alloc_size_(small_alloc_size) {} + ~BaseImpl(); + + // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay. + operator ViewImplT() const { return view_impl(); } + + auto view_impl() const -> ViewImplT { return view_impl_; } + + // Looks up the provided key in the hashtable. If found, returns a pointer to + // that entry and `false`. 
+ // + // If not found, will locate an empty entry for inserting into, set the + // metadata for that entry, and return a pointer to the entry and `true`. When + // necessary, this will grow the hashtable to cause there to be sufficient + // empty entries. + template + auto InsertImpl(LookupKeyT lookup_key, KeyContextT key_context) + -> std::pair; + + // Looks up the entry in the hashtable, and if found destroys the entry and + // returns `true`. If not found, returns `false`. + // + // Does not release any memory, just leaves a tombstone behind so this entry + // cannot be found and the slot can in theory be re-used. + template + auto EraseImpl(LookupKeyT lookup_key, KeyContextT key_context) -> bool; + + // Erases all entries in the hashtable but leaves the allocated storage. + auto ClearImpl() -> void; + + private: + template + friend class TableImpl; + + static constexpr ssize_t Alignment = std::max( + {alignof(MetadataGroup), alignof(StorageEntry)}); + + // Implementation of inline small storage for the provided key type, value + // type, and small size. Specialized for a zero small size to be an empty + // struct. + template + struct SmallStorage : Storage { + alignas(Alignment) uint8_t metadata[SmallSize]; + mutable StorageEntry entries[SmallSize]; + }; + // Specialized storage with no inline buffer to avoid any extra alignment. + template <> + struct SmallStorage<0> {}; + + static constexpr auto AllocByteSize(ssize_t alloc_size) -> ssize_t { + return ViewImplT::EntriesOffset(alloc_size) + sizeof(EntryT) * alloc_size; + } + static auto Allocate(ssize_t alloc_size) -> Storage*; + static auto Deallocate(Storage* storage, ssize_t alloc_size) -> void; + + auto growth_budget() const -> ssize_t { return growth_budget_; } + auto alloc_size() const -> ssize_t { return view_impl_.alloc_size_; } + auto alloc_size() -> ssize_t& { return view_impl_.alloc_size_; } + auto storage() const -> Storage* { return view_impl_.storage_; } + auto storage() -> Storage*& { return view_impl_.storage_; } + auto metadata() const -> uint8_t* { return view_impl_.metadata(); } + auto entries() const -> EntryT* { return view_impl_.entries(); } + auto small_alloc_size() const -> ssize_t { + return static_cast(small_alloc_size_); + } + auto is_small() const -> bool { return alloc_size() <= small_alloc_size(); } + + auto Construct(Storage* small_storage) -> void; + auto Destroy() -> void; + + template + auto InsertIntoEmpty(LookupKeyT lookup_key, KeyContextT key_context) + -> EntryT*; + + static auto ComputeNextAllocSize(ssize_t old_alloc_size) -> ssize_t; + static auto GrowthThresholdForAllocSize(ssize_t alloc_size) -> ssize_t; + + template + auto GrowAndInsert(LookupKeyT lookup_key, KeyContextT key_context) -> EntryT*; + + ViewImplT view_impl_; + int growth_budget_; + int small_alloc_size_; +}; + +// Implementation helper for defining a hashtable type with an SSO buffer. +// +// A specific user-facing hashtable should derive privately from this +// type, and forward the implementation of its interface to functions in this +// type. It should provide the corresponding user-facing hashtable base type as +// the `InputBaseT` type parameter (rather than a key/value pair), and this type +// will in turn derive from that provided base type. This allows derived-to-base +// conversion from the user-facing hashtable type to the user-facing hashtable +// base type. And it does so keeping the inheritance linear. 
The resulting +// linear inheritance hierarchy for a `Map` type will look like: +// +// Map +// ↓ +// TableImpl> +// ↓ +// MapBase +// ↓ +// BaseImpl +// +// Other than this inheritance technique, the patterns for this type, and how to +// build user-facing hashtable types from it, mirror those of `ViewImpl`. See +// its documentation for more details. +template +class TableImpl : public InputBaseT { + protected: + using BaseT = InputBaseT; + + TableImpl() : BaseT(SmallSize, small_storage()) {} + TableImpl(const TableImpl& arg); + TableImpl(TableImpl&& arg) noexcept; + + // Resets the hashtable to its initial state, clearing all entries and + // releasing all memory. If the hashtable had an SSO buffer, that is restored + // as the storage. Otherwise, a minimum sized table storage is allocated. + auto ResetImpl() -> void; + + private: + using KeyT = BaseT::KeyT; + using ValueT = BaseT::ValueT; + using EntryT = BaseT::EntryT; + using SmallStorage = BaseT::template SmallStorage; + + auto small_storage() const -> Storage*; + + [[no_unique_address]] mutable SmallStorage small_storage_; +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Only implementation details below this point. +// +//////////////////////////////////////////////////////////////////////////////// + +// Computes a seed that provides a small amount of entropy from ASLR where +// available with minimal cost. The priority is speed, and this computes the +// entropy in a way that doesn't require loading from memory, merely accessing +// entropy already available without accessing memory. +inline auto ComputeSeed() -> uint64_t { + // A global variable whose address is used as a seed. This allows ASLR to + // introduce some variation in hashtable ordering when enabled via the code + // model for globals. + extern volatile std::byte global_addr_seed; + + return reinterpret_cast(&global_addr_seed); +} + +inline auto ComputeProbeMaskFromSize(ssize_t size) -> size_t { + CARBON_DCHECK(llvm::isPowerOf2_64(size)) + << "Size must be a power of two for a hashed buffer!"; + // Since `size` is a power of two, we can make sure the probes are less + // than `size` by making the mask `size - 1`. We also mask off the low + // bits so the probes are a multiple of the size of the groups of entries. + return (size - 1) & ~GroupMask; +} + +// This class handles building a sequence of probe indices from a given +// starting point, including both the quadratic growth and masking the index +// to stay within the bucket array size. The starting point doesn't need to be +// clamped to the size ahead of time (or even be positive), we will do it +// internally. +// +// For reference on quadratic probing: +// https://en.wikipedia.org/wiki/Quadratic_probing +// +// We compute the quadratic probe index incrementally, but we can also compute +// it mathematically and will check that the incremental result matches our +// mathematical expectation. We use the quadratic probing formula of: +// +// p(start, step) = (start + (step + step^2) / 2) (mod size / GroupSize) +// +// However, we compute it incrementally and scale all the variables by the group +// size so it can be used as an index without an additional multiplication. 
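// Editor's sketch -- not part of this change. A minimal, self-contained check
// that the incremental probe update used by `ProbeSequence` below matches the
// closed-form quadratic formula in the comment above. `kGroup` and `kSize` are
// assumed example values; the real `GroupSize` is platform-specific.

#include <cassert>
#include <cstddef>

namespace probe_formula_sketch {

constexpr std::ptrdiff_t kGroup = 16;  // Assumed group size for illustration.
constexpr std::ptrdiff_t kSize = 256;  // Power-of-two table size.

// Walk the incremental quadratic probe from `start` and assert that it matches
// p(start, step) = (start + (step + step^2) / 2) mod (kSize / kGroup), with
// every quantity scaled down by the group size.
inline auto CheckProbeFormula(std::ptrdiff_t start) -> void {
  std::ptrdiff_t mask = (kSize - 1) & ~(kGroup - 1);
  std::ptrdiff_t p = start & mask;
  std::ptrdiff_t step = 0;
  for (std::ptrdiff_t i = 1; i < kSize / kGroup; ++i) {
    step += kGroup;
    p = (p + step) & mask;
    std::ptrdiff_t s = (start & mask) / kGroup;
    std::ptrdiff_t t = step / kGroup;
    assert(p / kGroup == (s + (t + t * t) / 2) % (kSize / kGroup));
  }
}

}  // namespace probe_formula_sketch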
+class ProbeSequence { + public: + ProbeSequence(ssize_t start, ssize_t size) { + mask_ = ComputeProbeMaskFromSize(size); + p_ = start & mask_; +#ifndef NDEBUG + start_ = start & mask_; + size_ = size; +#endif + } + + void Next() { + step_ += GroupSize; + p_ = (p_ + step_) & mask_; +#ifndef NDEBUG + // Verify against the quadratic formula we expect to be following by scaling + // everything down by `GroupSize`. + CARBON_DCHECK( + (p_ / GroupSize) == + ((start_ / GroupSize + + (step_ / GroupSize + (step_ / GroupSize) * (step_ / GroupSize)) / 2) % + (size_ / GroupSize))) + << "Index in probe sequence does not match the expected formula."; + CARBON_DCHECK(step_ < size_) << "We necessarily visit all groups, so we " + "can't have more probe steps than groups."; +#endif + } + + auto index() const -> ssize_t { return p_; } + + private: + ssize_t step_ = 0; + size_t mask_; + ssize_t p_; +#ifndef NDEBUG + ssize_t start_; + ssize_t size_; +#endif +}; + +// TODO: Evaluate keeping this outlined to see if macro benchmarks observe the +// same perf hit as micro benchmarks. +template +template +auto ViewImpl::LookupEntry( + LookupKeyT lookup_key, KeyContextT key_context) const -> EntryT* { + // Prefetch with a "low" temporal locality as we're primarily expecting a + // brief use of the storage and then to return to application code. + __builtin_prefetch(storage_, /*read*/ 0, /*low-locality*/ 1); + + ssize_t local_size = alloc_size_; + CARBON_DCHECK(local_size > 0); + + uint8_t* local_metadata = metadata(); + HashCode hash = key_context.HashKey(lookup_key, ComputeSeed()); + auto [hash_index, tag] = hash.ExtractIndexAndTag<7>(); + + EntryT* local_entries = entries(); + + // Walk through groups of entries using a quadratic probe starting from + // `hash_index`. + ProbeSequence s(hash_index, local_size); + do { + ssize_t group_index = s.index(); + + // For each group, match the tag against the metadata to extract the + // potentially matching entries within the group. + MetadataGroup g = MetadataGroup::Load(local_metadata, group_index); + auto metadata_matched_range = g.Match(tag); + if (LLVM_LIKELY(metadata_matched_range)) { + // If any entries in this group potentially match based on their metadata, + // walk each candidate and compare its key to see if we have definitively + // found a match. + EntryT* group_entries = &local_entries[group_index]; + auto byte_it = metadata_matched_range.begin(); + auto byte_end = metadata_matched_range.end(); + do { + EntryT* entry = byte_it.index_ptr(group_entries); + if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) { + __builtin_assume(entry != nullptr); + return entry; + } + ++byte_it; + } while (LLVM_UNLIKELY(byte_it != byte_end)); + } + + // We failed to find a matching entry in this bucket, so check if there are + // empty slots as that indicates we're done probing -- no later probed index + // could have a match. + auto empty_byte_matched_range = g.MatchEmpty(); + if (LLVM_LIKELY(empty_byte_matched_range)) { + return nullptr; + } + + s.Next(); + + // We use a weird construct of an "unlikely" condition of `true`. The goal + // is to get the compiler to not prioritize the back edge of the loop for + // code layout, and in at least some tests this seems to be an effective + // construct for achieving this. + } while (LLVM_UNLIKELY(true)); +} + +// Note that we force inlining here because we expect to be called with lambdas +// that will in turn be inlined to form the loop body. 
We don't want function +// boundaries within the loop for performance, and recognizing the degree of +// simplification from inlining these callbacks may be difficult to +// automatically recognize. +template +template +[[clang::always_inline]] auto +ViewImpl::ForEachEntry( + EntryCallbackT entry_callback, GroupCallbackT group_callback) const + -> void { + uint8_t* local_metadata = metadata(); + EntryT* local_entries = entries(); + + ssize_t local_size = alloc_size_; + for (ssize_t group_index = 0; group_index < local_size; + group_index += GroupSize) { + auto g = MetadataGroup::Load(local_metadata, group_index); + auto present_matched_range = g.MatchPresent(); + if (!present_matched_range) { + continue; + } + for (ssize_t byte_index : present_matched_range) { + entry_callback(local_entries[group_index + byte_index]); + } + + group_callback(&local_metadata[group_index]); + } +} + +template +auto ViewImpl::CountProbedKeys( + KeyContextT key_context) const -> ssize_t { + uint8_t* local_metadata = metadata(); + EntryT* local_entries = entries(); + ssize_t local_size = alloc_size_; + ssize_t count = 0; + for (ssize_t group_index = 0; group_index < local_size; + group_index += GroupSize) { + auto g = MetadataGroup::Load(local_metadata, group_index); + auto present_matched_range = g.MatchPresent(); + for (ssize_t byte_index : present_matched_range) { + ssize_t index = group_index + byte_index; + HashCode hash = + key_context.HashKey(local_entries[index].key(), ComputeSeed()); + ssize_t hash_index = hash.ExtractIndexAndTag<7>().first & + ComputeProbeMaskFromSize(local_size); + count += static_cast(hash_index != group_index); + } + } + return count; +} + +template +BaseImpl::~BaseImpl() { + Destroy(); +} + +// TODO: Evaluate whether it is worth forcing this out-of-line given the +// reasonable ABI boundary it forms and large volume of code necessary to +// implement it. +template +template +auto BaseImpl::InsertImpl( + LookupKeyT lookup_key, KeyContextT key_context) + -> std::pair { + CARBON_DCHECK(alloc_size() > 0); + + uint8_t* local_metadata = metadata(); + + HashCode hash = key_context.HashKey(lookup_key, ComputeSeed()); + auto [hash_index, tag] = hash.ExtractIndexAndTag<7>(); + + // We re-purpose the empty control byte to signal no insert is needed to the + // caller. This is guaranteed to not be a control byte we're inserting. + // constexpr uint8_t NoInsertNeeded = Group::Empty; + + ssize_t group_with_deleted_index; + MetadataGroup::MatchIndex deleted_match = {}; + + EntryT* local_entries = entries(); + + auto return_insert_at_index = [&](ssize_t index) -> std::pair { + // We'll need to insert at this index so set the control group byte to the + // proper value. + local_metadata[index] = tag | MetadataGroup::PresentMask; + return {&local_entries[index], true}; + }; + + for (ProbeSequence s(hash_index, alloc_size());; s.Next()) { + ssize_t group_index = s.index(); + auto g = MetadataGroup::Load(local_metadata, group_index); + + auto control_byte_matched_range = g.Match(tag); + if (control_byte_matched_range) { + EntryT* group_entries = &local_entries[group_index]; + auto byte_it = control_byte_matched_range.begin(); + auto byte_end = control_byte_matched_range.end(); + do { + EntryT* entry = byte_it.index_ptr(group_entries); + if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) { + return {entry, false}; + } + ++byte_it; + } while (LLVM_UNLIKELY(byte_it != byte_end)); + } + + // Track the first group with a deleted entry that we could insert over. 
+ if (!deleted_match) { + deleted_match = g.MatchDeleted(); + group_with_deleted_index = group_index; + } + + // We failed to find a matching entry in this bucket, so check if there are + // no empty slots. In that case, we'll continue probing. + auto empty_match = g.MatchEmpty(); + if (!empty_match) { + continue; + } + // Ok, we've finished probing without finding anything and need to insert + // instead. + + // If we found a deleted slot, we don't need the probe sequence to insert + // so just bail. We want to ensure building up a table is fast so we + // de-prioritize this a bit. In practice this doesn't have too much of an + // effect. + if (LLVM_UNLIKELY(deleted_match)) { + return return_insert_at_index(group_with_deleted_index + + deleted_match.index()); + } + + // We're going to need to grow by inserting into an empty slot. Check that + // we have the budget for that before we compute the exact index of the + // empty slot. Without the growth budget we'll have to completely rehash and + // so we can just bail here. + if (LLVM_UNLIKELY(growth_budget_ == 0)) { + return {GrowAndInsert(lookup_key, key_context), true}; + } + + --growth_budget_; + CARBON_DCHECK(growth_budget() >= 0) + << "Growth budget shouldn't have gone negative!"; + return return_insert_at_index(group_index + empty_match.index()); + } + + CARBON_FATAL() << "We should never finish probing without finding the entry " + "or an empty slot."; +} + +template +template +auto BaseImpl::EraseImpl( + LookupKeyT lookup_key, KeyContextT key_context) -> bool { + EntryT* entry = view_impl_.LookupEntry(lookup_key, key_context); + if (!entry) { + return false; + } + + // If there are empty slots in this group then nothing will probe past this + // group looking for an entry so we can simply set this slot to empty as + // well. However, if every slot in this group is full, it might be part of + // a long probe chain that we can't disrupt. In that case we mark the slot's + // metadata as deleted to keep probes continuing past it. + // + // If we mark the slot as empty, we'll also need to increase the growth + // budget. + uint8_t* local_metadata = metadata(); + EntryT* local_entries = entries(); + ssize_t index = entry - local_entries; + ssize_t group_index = index & ~GroupMask; + auto g = MetadataGroup::Load(local_metadata, group_index); + auto empty_matched_range = g.MatchEmpty(); + if (empty_matched_range) { + local_metadata[index] = MetadataGroup::Empty; + ++growth_budget_; + } else { + local_metadata[index] = MetadataGroup::Deleted; + } + + if constexpr (!EntryT::IsTriviallyDestructible) { + entry->Destroy(); + } + + return true; +} + +template +auto BaseImpl::ClearImpl() -> void { + view_impl_.ForEachEntry( + [](EntryT& entry) { + if constexpr (!EntryT::IsTriviallyDestructible) { + entry.Destroy(); + } + }, + [](uint8_t* metadata_group) { + // Clear the group. + std::memset(metadata_group, 0, GroupSize); + }); + growth_budget_ = GrowthThresholdForAllocSize(alloc_size()); +} + +// Allocates the appropriate memory layout for a table of the given +// `alloc_size`, with space both for the metadata array and entries. +// +// The returned pointer *must* be deallocated by calling the below `Deallocate` +// function with the same `alloc_size` as used here. 
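// Editor's sketch -- not part of this change. A worked example of the single
// allocation that `Allocate` below reserves: `alloc_size` metadata bytes
// followed by `alloc_size` entries, mirroring `EntriesOffset` and
// `AllocByteSize`. The entry size is an assumption for an `int` key paired
// with an `int` value.

#include <cstddef>

namespace layout_sketch {

constexpr std::ptrdiff_t kAllocSize = 64;  // A power-of-two table size.
constexpr std::ptrdiff_t kEntrySize =
    sizeof(int) + sizeof(int);             // Assumed key + value storage.
constexpr std::ptrdiff_t kEntriesOffset = kAllocSize;  // Metadata bytes first.
constexpr std::ptrdiff_t kTotalBytes =
    kEntriesOffset + kEntrySize * kAllocSize;

// With a typical 4-byte `int`, this is one 576-byte allocation: 64 metadata
// bytes followed by 64 eight-byte key/value entries.
static_assert(kTotalBytes == kAllocSize * (1 + kEntrySize));

}  // namespace layout_sketch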
+template +auto BaseImpl::Allocate( + ssize_t alloc_size) -> Storage* { + return reinterpret_cast(__builtin_operator_new( + AllocByteSize(alloc_size), static_cast(Alignment), + std::nothrow_t())); +} + +// Deallocates a table's storage that was allocated with the `Allocate` +// function. +template +auto BaseImpl::Deallocate( + Storage* storage, ssize_t alloc_size) -> void { + ssize_t allocated_size = AllocByteSize(alloc_size); + // We don't need the size, but make sure it always compiles. + static_cast(allocated_size); + __builtin_operator_delete(storage, +#if __cpp_sized_deallocation + allocated_size, +#endif + static_cast(Alignment)); +} + +// Construct a table using the provided small storage if `small_alloc_size_` is +// non-zero. If `small_alloc_size_` is zero, then `small_storage` won't be used +// and can be null. Regardless, after this the storage pointer is non-null and +// the size is non-zero so that we can directly begin inserting or querying the +// table. +template +auto BaseImpl::Construct( + Storage* small_storage) -> void { + if (small_alloc_size_ > 0) { + alloc_size() = small_alloc_size_; + storage() = small_storage; + } else { + // Directly allocate the initial buffer so that the hashtable is never in + // an empty state. + alloc_size() = MinAllocatedSize; + storage() = Allocate(MinAllocatedSize); + } + std::memset(metadata(), 0, alloc_size()); + growth_budget_ = GrowthThresholdForAllocSize(alloc_size()); +} + +// Destroy the current table, releasing any memory used. +template +auto BaseImpl::Destroy() -> void { + // Check for a moved-from state and don't do anything. Only a moved-from table + // has a zero size. + if (alloc_size() == 0) { + return; + } + + // Destroy all the entries. + if constexpr (!EntryT::IsTriviallyDestructible) { + view_impl_.ForEachEntry([](EntryT& entry) { entry.Destroy(); }, + [](auto...) {}); + } + + // If small, nothing to deallocate. + if (is_small()) { + return; + } + + // Just deallocate the storage without updating anything when destroying the + // object. + Deallocate(storage(), alloc_size()); +} + +// Optimized routine to insert a key into a table when that key *definitely* +// isn't present in the table and the table *definitely* has a viable empty slot +// (and growth space) to insert into before any deleted slots. When both of +// these are true, typically just after growth, we can dramatically simplify the +// insert position search. +template +template +[[clang::noinline]] auto +BaseImpl::InsertIntoEmpty( + LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* { + HashCode hash = key_context.HashKey(lookup_key, ComputeSeed()); + auto [hash_index, tag] = hash.ExtractIndexAndTag<7>(); + uint8_t* local_metadata = metadata(); + EntryT* local_entries = entries(); + + for (ProbeSequence s(hash_index, alloc_size());; s.Next()) { + ssize_t group_index = s.index(); + auto g = MetadataGroup::Load(local_metadata, group_index); + + if (auto empty_match = g.MatchEmpty()) { + ssize_t index = group_index + empty_match.index(); + local_metadata[index] = tag | MetadataGroup::PresentMask; + return &local_entries[index]; + } + + // Otherwise we continue probing. + } +} + +// Apply our doubling growth strategy and (re-)check invariants around table +// size. 
+template +auto BaseImpl::ComputeNextAllocSize( + ssize_t old_alloc_size) -> ssize_t { + CARBON_DCHECK(llvm::isPowerOf2_64(old_alloc_size)) + << "Expected a power of two!"; + ssize_t new_alloc_size; + bool overflow = __builtin_mul_overflow(old_alloc_size, 2, &new_alloc_size); + CARBON_CHECK(!overflow) << "Computing the new size overflowed `ssize_t`!"; + return new_alloc_size; +} + +// Compute the growth threshold for a given size. +template +auto BaseImpl::GrowthThresholdForAllocSize(ssize_t alloc_size) + -> ssize_t { + // We use a 7/8ths load factor to trigger growth. + return alloc_size - alloc_size / 8; +} + +// Grow the hashtable to create space and then insert into it. Returns the +// selected insertion entry. Never returns null. In addition to growing and +// selecting the insertion entry, this routine updates the metadata array so +// that this function can be directly called and the result returned from +// `InsertImpl`. +template +template +[[clang::noinline]] auto +BaseImpl::GrowAndInsert( + LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* { + // We collect the probed elements in a small vector for re-insertion. It is + // tempting to reuse the already allocated storage, but doing so appears to + // be a (very slight) performance regression. These are relatively rare and + // storing them into the existing storage creates stores to the same regions + // of memory we're reading. Moreover, it requires moving both the key and the + // value twice, and doing the `memcpy` widening for relocatable types before + // the group walk rather than after the group walk. In practice, between the + // statistical rareness and using a large small size buffer here on the stack, + // we can handle this most efficiently with temporary, additional storage. + llvm::SmallVector probed_indices; + + // We grow into a new `MapBase` so that both the new and old maps are + // fully functional until all the entries are moved over. However, we directly + // manipulate the internals to short circuit many aspects of the growth. + ssize_t old_size = alloc_size(); + CARBON_DCHECK(old_size > 0); + CARBON_DCHECK(growth_budget_ == 0); + + bool old_small = is_small(); + Storage* old_storage = storage(); + uint8_t* old_metadata = metadata(); + EntryT* old_entries = entries(); + +#ifndef NDEBUG + // Count how many of the old table slots will end up being empty after we grow + // the table. This is both the currently empty slots, but also the deleted + // slots because we clear them to empty and re-insert everything that had any + // probing. + ssize_t debug_empty_count = + llvm::count(llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Empty); + ssize_t debug_deleted_count = llvm::count( + llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Deleted); + CARBON_DCHECK(debug_empty_count >= + (old_size - GrowthThresholdForAllocSize(old_size))) + << "debug_empty_count: " << debug_empty_count + << ", debug_deleted_count: " << debug_deleted_count + << ", size: " << old_size; +#endif + + // Compute the new size and grow the storage in place (if possible). + ssize_t new_size = ComputeNextAllocSize(old_size); + alloc_size() = new_size; + storage() = Allocate(new_size); + growth_budget_ = GrowthThresholdForAllocSize(new_size); + + // Now extract the new components of the table. + uint8_t* new_metadata = metadata(); + EntryT* new_entries = entries(); + + // We always double the size when we grow. 
This allows an important + // optimization -- we're adding exactly one more high bit to the hash-computed + // index for each entry. This in turn means we can classify every entry in the + // table into three cases: + // + // 1) The new high bit is zero, the entry is at the same index in the new + // table as the old. + // + // 2) The new high bit is one, the entry is at the old index plus the old + // size. + // + // 3) The entry's current index doesn't match the initial hash index because + // it required some amount of probing to find an empty slot. + // + // The design of the hash table tries to minimize how many entries fall into + // case (3), so we expect the vast majority of entries to be in (1) or (2). + // This lets us model growth notionally as duplicating the hash table, + // clearing out the empty slots, and inserting any probed elements. + + ssize_t count = 0; + for (ssize_t group_index = 0; group_index < old_size; + group_index += GroupSize) { + auto low_g = MetadataGroup::Load(old_metadata, group_index); + // Make sure to match present elements first to enable pipelining with + // clearing. + auto present_matched_range = low_g.MatchPresent(); + low_g.ClearDeleted(); + MetadataGroup high_g; + if constexpr (MetadataGroup::FastByteClear) { + // When we have a fast byte clear, we can update the metadata for the + // growth in-register and store at the end. + high_g = low_g; + } else { + // If we don't have a fast byte clear, we can store the metadata group + // eagerly here and overwrite bytes with a byte store below instead of + // clearing the byte in-register. + low_g.Store(new_metadata, group_index); + low_g.Store(new_metadata, group_index | old_size); + } + for (ssize_t byte_index : present_matched_range) { + ++count; + ssize_t old_index = group_index + byte_index; + if constexpr (!MetadataGroup::FastByteClear) { + CARBON_DCHECK(new_metadata[old_index] == old_metadata[old_index]); + CARBON_DCHECK(new_metadata[old_index | old_size] == + old_metadata[old_index]); + } + HashCode hash = + key_context.HashKey(old_entries[old_index].key(), ComputeSeed()); + ssize_t old_hash_index = hash.ExtractIndexAndTag<7>().first & + ComputeProbeMaskFromSize(old_size); + if (LLVM_UNLIKELY(old_hash_index != group_index)) { + probed_indices.push_back(old_index); + if constexpr (MetadataGroup::FastByteClear) { + low_g.ClearByte(byte_index); + high_g.ClearByte(byte_index); + } else { + new_metadata[old_index] = MetadataGroup::Empty; + new_metadata[old_index | old_size] = MetadataGroup::Empty; + } + continue; + } + ssize_t new_index = hash.ExtractIndexAndTag<7>().first & + ComputeProbeMaskFromSize(new_size); + CARBON_DCHECK(new_index == old_hash_index || + new_index == (old_hash_index | old_size)); + // Toggle the newly added bit of the index to get to the other possible + // target index. + if constexpr (MetadataGroup::FastByteClear) { + (new_index == old_hash_index ? high_g : low_g).ClearByte(byte_index); + new_index += byte_index; + } else { + new_index += byte_index; + new_metadata[new_index ^ old_size] = MetadataGroup::Empty; + } + + // If we need to explicitly move (and destroy) the key or value, do so + // here where we already know its target. 
+ if constexpr (!EntryT::IsTriviallyRelocatable) { + new_entries[new_index].MoveFrom(std::move(old_entries[old_index])); + } + } + if constexpr (MetadataGroup::FastByteClear) { + low_g.Store(new_metadata, group_index); + high_g.Store(new_metadata, (group_index | old_size)); + } + } + CARBON_DCHECK((count - static_cast(probed_indices.size())) == + (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size), + MetadataGroup::Empty))); +#ifndef NDEBUG + CARBON_DCHECK((debug_empty_count + debug_deleted_count) == + (old_size - count)); + CARBON_DCHECK(llvm::count(llvm::ArrayRef(new_metadata, new_size), + MetadataGroup::Empty) == + debug_empty_count + debug_deleted_count + + static_cast(probed_indices.size()) + old_size); +#endif + + // If the keys or values are trivially relocatable, we do a bulk memcpy of + // them into place. This will copy them into both possible locations, which is + // fine. One will be empty and clobbered if reused or ignored. The other will + // be the one used. This might seem like it needs it to be valid for us to + // create two copies, but it doesn't. This produces the exact same storage as + // copying the storage into the wrong location first, and then again into the + // correct location. Only one is live and only one is destroyed. + if constexpr (EntryT::IsTriviallyRelocatable) { + memcpy(new_entries, old_entries, old_size * sizeof(EntryT)); + memcpy(new_entries + old_size, old_entries, old_size * sizeof(EntryT)); + } + + // We then need to do a normal insertion for anything that was probed before + // growth, but we know we'll find an empty slot, so leverage that. + for (ssize_t old_index : probed_indices) { + EntryT* new_entry = + InsertIntoEmpty(old_entries[old_index].key(), key_context); + new_entry->MoveFrom(std::move(old_entries[old_index])); + } + CARBON_DCHECK(count == + (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size), + MetadataGroup::Empty))); + growth_budget_ -= count; + CARBON_DCHECK(growth_budget_ == + (GrowthThresholdForAllocSize(new_size) - + (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size), + MetadataGroup::Empty)))); + CARBON_DCHECK(growth_budget_ > 0 && + "Must still have a growth budget after rehash!"); + + if (!old_small) { + // Old isn't a small buffer, so we need to deallocate it. + Deallocate(old_storage, old_size); + } + + // And lastly insert the lookup_key into an index in the newly grown map and + // return that index for use. + --growth_budget_; + return InsertIntoEmpty(lookup_key, key_context); +} + +template +TableImpl::TableImpl(const TableImpl& arg) + : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) { + CARBON_DCHECK(arg.small_alloc_size_ == SmallSize); + + ssize_t local_size = arg.alloc_size(); + + if (SmallSize > 0 && arg.is_small()) { + CARBON_DCHECK(local_size == SmallSize); + this->storage() = small_storage(); + } else { + this->storage() = BaseT::Allocate(local_size); + } + + // Preserve which slot every entry is in, including tombstones in the + // metadata, in order to copy into the new table's storage without rehashing + // all of the keys. This is especially important as we don't have an easy way + // to access the key context needed for rehashing here. 
+ uint8_t* local_metadata = this->metadata(); + EntryT* local_entries = this->entries(); + const uint8_t* local_arg_metadata = arg.metadata(); + const EntryT* local_arg_entries = arg.entries(); + memcpy(local_metadata, local_arg_metadata, local_size); + + for (ssize_t group_index = 0; group_index < local_size; + group_index += GroupSize) { + auto g = MetadataGroup::Load(local_arg_metadata, group_index); + for (ssize_t byte_index : g.MatchPresent()) { + local_entries[group_index + byte_index].CopyFrom( + local_arg_entries[group_index + byte_index]); + } + } +} + +// Puts the incoming table into a moved-from state that can be destroyed or +// re-initialized but must not be used otherwise. +template +TableImpl::TableImpl(TableImpl&& arg) noexcept + : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) { + CARBON_DCHECK(arg.small_alloc_size_ == SmallSize); + + ssize_t local_size = arg.alloc_size(); + + if (SmallSize > 0 && arg.is_small()) { + CARBON_DCHECK(local_size == SmallSize); + this->storage() = small_storage(); + + // For small tables, we have to move the entries as we can't move the tables + // themselves. We do this preserving their slots and even tombstones to + // avoid rehashing. + uint8_t* local_metadata = this->metadata(); + EntryT* local_entries = this->entries(); + uint8_t* local_arg_metadata = arg.metadata(); + EntryT* local_arg_entries = arg.entries(); + memcpy(local_metadata, local_arg_metadata, local_size); + if (EntryT::IsTriviallyRelocatable) { + memcpy(local_entries, local_arg_entries, SmallSize * sizeof(EntryT)); + } else { + for (ssize_t group_index = 0; group_index < local_size; + group_index += GroupSize) { + auto g = MetadataGroup::Load(local_arg_metadata, group_index); + for (ssize_t byte_index : g.MatchPresent()) { + local_entries[group_index + byte_index].MoveFrom( + std::move(local_arg_entries[group_index + byte_index])); + } + } + } + } else { + // Just point to the allocated storage. + this->storage() = arg.storage(); + } + + // Finally, put the incoming table into a moved-from state. + arg.alloc_size() = 0; + // Replace the pointer with null to ease debugging. + arg.storage() = nullptr; +} + +// Reset a table to its original state, including releasing any allocated +// memory. +template +auto TableImpl::ResetImpl() -> void { + this->Destroy(); + + // Re-initialize the whole thing. + CARBON_DCHECK(this->small_alloc_size() == SmallSize); + this->Construct(small_storage()); +} + +template +auto TableImpl::small_storage() const -> Storage* { + if constexpr (SmallSize > 0) { + // Do a bunch of validation of the small size to establish our invariants + // when we know we have a non-zero small size. + static_assert(llvm::isPowerOf2_64(SmallSize), + "SmallSize must be a power of two for a hashed buffer!"); + static_assert( + SmallSize >= MaxGroupSize, + "We require all small sizes to multiples of the largest group " + "size supported to ensure it can be used portably. "); + static_assert( + (SmallSize % MaxGroupSize) == 0, + "Small size must be a multiple of the max group size supported " + "so that we can allocate a whole number of groups."); + // Implied by the max asserts above. + static_assert(SmallSize >= GroupSize); + static_assert((SmallSize % GroupSize) == 0); + + static_assert(SmallSize >= alignof(StorageEntry), + "Requested a small size that would require padding between " + "metadata bytes and correctly aligned key and value types. 
" + "Either a larger small size or a zero small size and heap " + "allocation are required for this key and value type."); + + static_assert(offsetof(SmallStorage, entries) == SmallSize, + "Offset to entries in small size storage doesn't match " + "computed offset!"); + + return &small_storage_; + } else { + static_assert( + sizeof(TableImpl) == sizeof(BaseT), + "Empty small storage caused a size difference and wasted space!"); + + return nullptr; + } +} + +} // namespace Carbon::RawHashtable + +#endif // CARBON_COMMON_RAW_HASHTABLE_H_ diff --git a/common/raw_hashtable_benchmark_helpers.cpp b/common/raw_hashtable_benchmark_helpers.cpp new file mode 100644 index 0000000000000..ccb82d2919b4b --- /dev/null +++ b/common/raw_hashtable_benchmark_helpers.cpp @@ -0,0 +1,383 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "common/raw_hashtable_benchmark_helpers.h" + +#include +#include + +namespace Carbon::RawHashtable { + +// A local shuffle implementation built on Abseil to improve performance in +// debug builds. +template +static auto Shuffle(llvm::MutableArrayRef data, absl::BitGen& gen) { + for (ssize_t i : llvm::seq(0, data.size() - 1)) { + ssize_t j = absl::Uniform(gen, 0, data.size() - i); + if (j != 0) { + std::swap(data[i], data[i + j]); + } + } +} + +constexpr ssize_t NumChars = 64; +static_assert(llvm::isPowerOf2_64(NumChars)); + +// For benchmarking, we use short strings in a fixed distribution with common +// characters. Real-world strings aren't uniform across ASCII or Unicode, etc. +// And for *micro*-benchmarking we want to focus on the map overhead with short, +// fast keys. +static auto MakeChars() -> llvm::OwningArrayRef { + llvm::OwningArrayRef characters(NumChars); + + // Start with `-` and `_`, and then add `a` - `z`, `A` - `Z`, and `0` - `9`. + characters[0] = '-'; + characters[1] = '_'; + ssize_t i = 2; + for (auto range : + {llvm::seq_inclusive('a', 'z'), llvm::seq_inclusive('A', 'Z'), + llvm::seq_inclusive('0', '9')}) { + for (char c : range) { + characters[i] = c; + ++i; + } + } + CARBON_CHECK(i == NumChars) << "Expected exactly " << NumChars + << " characters, got " << i << " instead!"; + return characters; +} + +constexpr ssize_t NumFourCharStrs = NumChars * NumChars * NumChars * NumChars; +static_assert(llvm::isPowerOf2_64(NumFourCharStrs)); + +// Compute every 4-character string in a shuffled array. This is a little memory +// intense -- 64 MiB -- but ends up being much cheaper by letting us reliably +// select a unique 4-character sequence to avoid collisions. +static auto MakeFourCharStrs(llvm::ArrayRef characters, absl::BitGen& gen) + -> llvm::OwningArrayRef> { + constexpr ssize_t NumCharsMask = NumChars - 1; + constexpr ssize_t NumCharsShift = llvm::CTLog2(); + llvm::OwningArrayRef> four_char_strs(NumFourCharStrs); + for (auto [i, str] : llvm::enumerate(four_char_strs)) { + str[0] = characters[i & NumCharsMask]; + i >>= NumCharsShift; + str[1] = characters[i & NumCharsMask]; + i >>= NumCharsShift; + str[2] = characters[i & NumCharsMask]; + i >>= NumCharsShift; + CARBON_CHECK((i & ~NumCharsMask) == 0); + str[3] = characters[i]; + } + Shuffle(four_char_strs, gen); + return four_char_strs; +} + +constexpr ssize_t NumRandomChars = static_cast(64) * 1024; + +// Create a pool of random characters to sample from rather than computing this +// for every string which is very slow in debug builds. 
We also pad this pool +// with the max length so we can pull the full length from the end to simplify +// the logic when wrapping around the pool. +static auto MakeRandomChars(llvm::ArrayRef characters, int max_length, + absl::BitGen& gen) -> llvm::OwningArrayRef { + llvm::OwningArrayRef random_chars(NumRandomChars + max_length); + for (char& c : random_chars) { + c = characters[absl::Uniform(gen, 0, NumChars)]; + } + return random_chars; +} + +// Make a small vector of pointers into a single allocation of raw strings. The +// allocated memory is expected to leak and must be transitively referenced by a +// global. Each string has `length` size (which must be >= 4), and there are +// `key_count` keys in the result. Each key is filled from the `random_chars` +// until the last 4 characters. The last four characters of each string will be +// taken sequentially from `four_char_strs` from some random start position to +// ensure no duplicate keys are produced. +static auto MakeRawStrKeys(ssize_t length, ssize_t key_count, + llvm::ArrayRef> four_char_strs, + llvm::ArrayRef random_chars, absl::BitGen& gen) + -> llvm::SmallVector { + llvm::SmallVector raw_keys; + CARBON_CHECK(length >= 4); + ssize_t prefix_length = length - 4; + + // Select a random start for indexing our four character strings. + ssize_t four_char_index = absl::Uniform(gen, 0, NumFourCharStrs); + + // Select a random start for the prefix random characters. + ssize_t random_chars_index = absl::Uniform(gen, 0, NumRandomChars); + + // Do a single memory allocation for all the keys of this length to + // avoid an excessive number of small and fragmented allocations. This + // memory is intentionally leaked as the keys are global and will + // themselves will point into it. + char* key_text = new char[key_count * length]; + + // Reserve all the key space since we know how many we'll need. + raw_keys.reserve(key_count); + for ([[gnu::unused]] ssize_t i : llvm::seq(0, key_count)) { + memcpy(key_text, random_chars.data() + random_chars_index, prefix_length); + random_chars_index += prefix_length; + random_chars_index &= NumRandomChars - 1; + // Set the last four characters with this entry in the shuffled + // sequence. + memcpy(key_text + prefix_length, four_char_strs[four_char_index].data(), 4); + // Step through the shuffled sequence. We start at a random position, + // so we need to wrap around the end. + ++four_char_index; + four_char_index &= NumFourCharStrs - 1; + + // And finally save the start pointer as one of our raw keys. + raw_keys.push_back(key_text); + key_text += length; + } + return raw_keys; +} + +// Build up a large collection of random and unique string keys. This is +// actually a relatively expensive operation due to needing to build all the +// random string text. As a consequence, the initializer of this global is +// somewhat performance tuned to ensure benchmarks don't take an excessive +// amount of time to run or use an excessive amount of memory. +static absl::NoDestructor> raw_str_keys{ + [] { + llvm::OwningArrayRef keys(MaxNumKeys); + absl::BitGen gen; + + std::array length_buckets = { + 4, 4, 4, 4, 5, 5, 5, 5, 7, 7, 10, 10, 15, 25, 40, 80, + }; + static_assert((MaxNumKeys % length_buckets.size()) == 0); + CARBON_CHECK(llvm::is_sorted(length_buckets)); + + // For each distinct length bucket, we build a vector of raw keys. + std::forward_list> raw_keys_storage; + // And a parallel array to the length buckets with the raw keys of that + // length. 
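+      // As a concrete illustration of the sizing below: length 4 appears four
+      // times in the 16-entry bucket array, so its single bucket is built with
+      // `key_count = (MaxNumKeys / 16) * 4` keys, while a length such as 80
+      // that appears once gets `MaxNumKeys / 16` keys, all carved out of one
+      // contiguous text allocation per distinct length.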
+ std::array*, length_buckets.size()> + raw_keys_buckets; + + llvm::OwningArrayRef characters = MakeChars(); + llvm::OwningArrayRef> four_char_strs = + MakeFourCharStrs(characters, gen); + llvm::OwningArrayRef random_chars = MakeRandomChars( + characters, /*max_length=*/length_buckets.back(), gen); + + ssize_t prev_length = -1; + for (auto [length_index, length] : llvm::enumerate(length_buckets)) { + // We can detect repetitions in length as they are sorted. + if (length == prev_length) { + raw_keys_buckets[length_index] = raw_keys_buckets[length_index - 1]; + continue; + } + prev_length = length; + + // We want to compute all the keys of this length that we'll need. + ssize_t key_count = (MaxNumKeys / length_buckets.size()) * + llvm::count(length_buckets, length); + + raw_keys_buckets[length_index] = + &raw_keys_storage.emplace_front(MakeRawStrKeys( + length, key_count, four_char_strs, random_chars, gen)); + } + + // Now build the actual key array from our intermediate storage by + // round-robin extracting from the length buckets. + for (auto [index, key] : llvm::enumerate(keys)) { + ssize_t bucket = index % length_buckets.size(); + ssize_t length = length_buckets[bucket]; + // We pop a raw key from the list of them associated with this bucket. + const char* raw_key = raw_keys_buckets[bucket]->pop_back_val(); + // And build our key from that. + key = llvm::StringRef(raw_key, length); + } + // Check that in fact we popped every raw key into our main keys. + for (const auto& raw_keys : raw_keys_storage) { + CARBON_CHECK(raw_keys.empty()); + } + return keys; + }()}; + +static absl::NoDestructor> raw_ptr_keys{[] { + llvm::OwningArrayRef keys(MaxNumKeys); + for (auto [index, key] : llvm::enumerate(keys)) { + // We leak these pointers -- this is a static initializer executed once. + key = new int(static_cast(index)); + } + return keys; +}()}; + +static absl::NoDestructor> raw_int_keys{[] { + llvm::OwningArrayRef keys(MaxNumKeys); + for (auto [index, key] : llvm::enumerate(keys)) { + key = index + 1; + } + return keys; +}()}; + +namespace { + +// Allow generically dispatching over the specific key types we build and +// support. +template +auto GetRawKeys() -> llvm::ArrayRef { + if constexpr (std::is_same_v) { + return *raw_str_keys; + } else if constexpr (std::is_pointer_v) { + return *raw_ptr_keys; + } else { + return *raw_int_keys; + } +} + +template +static absl::NoDestructor< + std::map, llvm::OwningArrayRef>> + lookup_keys_storage; + +// Given a particular table keys size and lookup keys size, provide an array ref +// to a shuffled set of lookup keys. +// +// Because different table sizes pull from different sub-ranges of our raw keys, +// we need to compute a distinct set of random keys in the table to use for +// lookups depending on the table size. And we also want to have an even +// distribution of key *sizes* throughout the lookup keys, and so we can't +// compute a single lookup keys array of the maximum size. Instead we need to +// compute a distinct special set of lookup keys for each pair of table and +// lookup size, and then shuffle that specific set into a random sequence that +// is returned. This function memoizes this sequence for each pair of sizes. +template +auto GetShuffledLookupKeys(ssize_t table_keys_size, ssize_t lookup_keys_size) + -> llvm::ArrayRef { + // The raw keys aren't shuffled and round-robin through the sizes. We want to + // keep the total size of lookup keys used exactly the same across runs. 
So + // for a given size we always take the leading sequence from the raw keys for + // that size, duplicating as needed to get the desired lookup sequence size, + // and then shuffle the keys in that sequence to end up with a random sequence + // of keys. We store each of these shuffled sequences in a map to avoid + // repeatedly computing them. + llvm::OwningArrayRef& lookup_keys = + (*lookup_keys_storage)[{table_keys_size, lookup_keys_size}]; + if (lookup_keys.empty()) { + lookup_keys = llvm::OwningArrayRef(lookup_keys_size); + auto raw_keys = GetRawKeys(); + for (auto [index, key] : llvm::enumerate(lookup_keys)) { + key = raw_keys[index % table_keys_size]; + } + absl::BitGen gen; + Shuffle(lookup_keys, gen); + } + CARBON_CHECK(static_cast(lookup_keys.size()) == lookup_keys_size); + + return lookup_keys; +} + +} // namespace + +template +auto GetKeysAndMissKeys(ssize_t table_keys_size) + -> std::pair, llvm::ArrayRef> { + CARBON_CHECK(table_keys_size <= MaxNumKeys); + // The raw keys aren't shuffled and round-robin through the sizes. Take the + // tail of this sequence and shuffle it to form a random set of miss keys with + // a consistent total size. + static absl::NoDestructor> miss_keys{[] { + llvm::OwningArrayRef keys; + keys = GetRawKeys().take_back(NumOtherKeys); + CARBON_CHECK(keys.size() == NumOtherKeys); + absl::BitGen gen; + Shuffle(keys, gen); + return keys; + }()}; + + return {GetRawKeys().slice(0, table_keys_size), *miss_keys}; +} +template auto GetKeysAndMissKeys(ssize_t size) + -> std::pair, llvm::ArrayRef>; +template auto GetKeysAndMissKeys(ssize_t size) + -> std::pair, llvm::ArrayRef>; +template auto GetKeysAndMissKeys(ssize_t size) + -> std::pair, + llvm::ArrayRef>; + +template +auto GetKeysAndHitKeys(ssize_t table_keys_size, ssize_t lookup_keys_size) + -> std::pair, llvm::ArrayRef> { + CARBON_CHECK(table_keys_size <= MaxNumKeys); + CARBON_CHECK(lookup_keys_size <= MaxNumKeys); + return {GetRawKeys().slice(0, table_keys_size), + GetShuffledLookupKeys(table_keys_size, lookup_keys_size)}; +} +template auto GetKeysAndHitKeys(ssize_t size, ssize_t lookup_keys_size) + -> std::pair, llvm::ArrayRef>; +template auto GetKeysAndHitKeys(ssize_t size, ssize_t lookup_keys_size) + -> std::pair, llvm::ArrayRef>; +template auto GetKeysAndHitKeys(ssize_t size, + ssize_t lookup_keys_size) + -> std::pair, + llvm::ArrayRef>; + +template +auto DumpHashStatistics(llvm::ArrayRef keys) -> void { + if (keys.size() < GroupSize) { + return; + } + + // The hash table load factor is 7/8ths, so we want to add 1/7th of our + // current size, subtract one, and pick the next power of two to get the power + // of two where 7/8ths is greater than or equal to the incoming key size. 
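+  // For example, with 1000 keys this computes `NextPowerOf2(1000 + 142 - 1)`,
+  // which is 2048, and 7/8 * 2048 = 1792 >= 1000, whereas the next smaller
+  // power of two (1024) would only hold 896 keys at that load factor.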
+ ssize_t expected_size = + llvm::NextPowerOf2(keys.size() + (keys.size() / 7) - 1); + + constexpr int GroupShift = llvm::CTLog2(); + + size_t mask = ComputeProbeMaskFromSize(expected_size); + uint64_t salt = ComputeSeed(); + auto get_hash_index = [mask, salt](auto x) -> ssize_t { + auto [hash_index, _] = HashValue(x, salt).template ExtractIndexAndTag<7>(); + return (hash_index & mask) >> GroupShift; + }; + + std::vector> grouped_key_indices(expected_size >> + GroupShift); + for (auto [i, k] : llvm::enumerate(keys)) { + ssize_t hash_index = get_hash_index(k); + CARBON_CHECK(hash_index < (expected_size >> GroupShift)) << hash_index; + grouped_key_indices[hash_index].push_back(i); + } + ssize_t max_group_index = + std::max_element(grouped_key_indices.begin(), grouped_key_indices.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.size() < rhs.size(); + }) - + grouped_key_indices.begin(); + + // If the max number of collisions on the index is less than or equal to the + // group size, there shouldn't be any necessary probing (outside of deletion) + // and so this isn't interesting, skip printing. + if (grouped_key_indices[max_group_index].size() <= GroupSize) { + return; + } + + llvm::errs() << "keys: " << keys.size() + << " groups: " << grouped_key_indices.size() << "\n" + << "max group index: " << llvm::formatv("{0x8}", max_group_index) + << " collisions: " + << grouped_key_indices[max_group_index].size() << "\n"; + + for (auto i : llvm::ArrayRef(grouped_key_indices[max_group_index]) + .take_front(2 * GroupSize)) { + auto k = keys[i]; + auto hash = static_cast(HashValue(k, salt)); + llvm::errs() << " key: " << k + << " salt: " << llvm::formatv("{0:x16}", salt) + << " hash: " << llvm::formatv("{0:x16}", hash) << "\n"; + } +} +template auto DumpHashStatistics(llvm::ArrayRef keys) -> void; +template auto DumpHashStatistics(llvm::ArrayRef keys) -> void; +template auto DumpHashStatistics(llvm::ArrayRef keys) -> void; + +} // namespace Carbon::RawHashtable diff --git a/common/raw_hashtable_benchmark_helpers.h b/common/raw_hashtable_benchmark_helpers.h new file mode 100644 index 0000000000000..b5566dfb062e8 --- /dev/null +++ b/common/raw_hashtable_benchmark_helpers.h @@ -0,0 +1,208 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_ +#define CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_ + +#include +#include + +#include +#include +#include + +#include "absl/base/no_destructor.h" +#include "absl/random/random.h" +#include "common/check.h" +#include "common/hashing.h" +#include "common/raw_hashtable.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" + +namespace Carbon::RawHashtable { + +// We want to support benchmarking with 16M keys plus up to 256 "other" keys +// (for misses). The large number of keys helps check for performance hiccups +// with especially large tables and when missing all levels of cache. +inline constexpr ssize_t NumOtherKeys = 1 << 8; +inline constexpr ssize_t MaxNumKeys = (1 << 24) + NumOtherKeys; + +// Get an array of main keys with the given `size`, which must be less than +// 2^24. Also get a miss keys array of `NumOtherKeys` which has no collisions +// with the main keys. +// +// For a given size, this will return the same arrays. 
This uses unsynchronized +// global state, and so is thread hostile and must not be called before main. +template +auto GetKeysAndMissKeys(ssize_t table_keys_size) + -> std::pair, llvm::ArrayRef>; + +// Get an array of main keys with the given `size`, which must be less than +// 2^24. Also get a hit keys array of `lookup_keys_size` all of which will occur +// in the may keys array. If the lookup size is larger than the main size, the +// lookup sequence will contain duplicates. +// +// For a given size, this will return the same arrays. This uses unsynchronized +// global state, and so is thread hostile and must not be called before main. +template +auto GetKeysAndHitKeys(ssize_t table_keys_size, ssize_t lookup_keys_size) + -> std::pair, llvm::ArrayRef>; + +// Dump statistics about hashing the given keys. +template +auto DumpHashStatistics(llvm::ArrayRef keys) -> void; + +// Convert values used in hashtable benchmarking to a bool. This is used to form +// dependencies between values stored in the hashtable between benchmark +// iterations. +template +auto ValueToBool(T value) -> bool { + if constexpr (std::is_same_v) { + return value.size() > 0; + } else if constexpr (std::is_pointer_v) { + return value != nullptr; + } else { + // We want our keys to include `0` for integers, so use the largest value. + return value != std::numeric_limits::max(); + } +} + +inline auto SizeArgs(benchmark::internal::Benchmark* b) -> void { + // Benchmarks for "miss" operations only have one parameter -- the size of the + // table. These benchmarks use a fixed `NumOtherKeys` set of extra keys for + // each miss operation. + b->DenseRange(1, 4, 1); + b->Arg(8); + b->Arg(16); + b->Arg(32); + + // For sizes >= 64 we first use the power of two which will have a low load + // factor, and then target exactly at our max load factor. + auto large_sizes = {64, 1 << 8, 1 << 12, 1 << 16, 1 << 20, 1 << 24}; + for (auto s : large_sizes) { + b->Arg(s); + } + for (auto s : large_sizes) { + b->Arg(s - (s / 8)); + } +} + +inline auto HitArgs(benchmark::internal::Benchmark* b) -> void { + // There are two parameters for benchmarks of "hit" operations. The first is + // the size of the hashtable itself. The second is the size of a buffer of + // random keys actually in the hashtable to use for the operations. + // + // For small sizes, we use a fixed `NumOtherKeys` lookup key count. This is + // enough to avoid patterns of queries training the branch predictor just from + // the keys themselves, while small enough to avoid significant L1 cache + // pressure. + b->ArgsProduct({benchmark::CreateDenseRange(1, 4, 1), {NumOtherKeys}}); + b->Args({8, NumOtherKeys}); + b->Args({16, NumOtherKeys}); + b->Args({32, NumOtherKeys}); + + // For sizes >= 64 we first use the power of two which will have a low load + // factor, and then target exactly at our max load factor. Start the sizes + // list off with the powers of two, and the append a version of each power of + // two adjusted down to the load factor. We'll then build the benchmarks from + // these below. 
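+  // For example, 64 is paired with 64 - 64/8 = 56, and 1 << 20 with
+  // (1 << 20) - (1 << 17) = 917504, so each power of two is benchmarked both
+  // well below and right at the 7/8ths max load factor.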
+ std::vector large_sizes = {64, 1 << 8, 1 << 12, + 1 << 16, 1 << 20, 1 << 24}; + for (auto i : llvm::seq(0, large_sizes.size())) { + ssize_t s = large_sizes[i]; + large_sizes.push_back(s - (s / 8)); + } + + for (auto s : large_sizes) { + b->Args({s, NumOtherKeys}); + + // Once the sizes are more than 4x the `NumOtherKeys` minimum lookup buffer + // size, also include 25% and 50% lookup buffer sizes which will + // increasingly exhaust the ability to keep matching entries in the cache. + if (s >= NumOtherKeys) { + b->Args({s, s / 4}); + b->Args({s, s / 2}); + } + } +} + +// Provide some Dense{Map,Set}Info viable implementations for the key types +// using Carbon's hashing framework. These let us benchmark the data structure +// alone rather than the combination of data structure and hashing routine. +// +// We only provide these for benchmarking -- they are *not* necessarily suitable +// for broader use. The Carbon hashing infrastructure has only been evaluated in +// the context of its specific hashtable design. +template +struct CarbonHashDI; + +template <> +struct CarbonHashDI { + static auto getEmptyKey() -> int { return -1; } + static auto getTombstoneKey() -> int { return -2; } + static auto getHashValue(const int val) -> unsigned { + return static_cast(HashValue(val)); + } + static auto isEqual(const int lhs, const int rhs) -> bool { + return lhs == rhs; + } +}; + +template +struct CarbonHashDI { + static constexpr uintptr_t Log2MaxAlign = 12; + + static auto getEmptyKey() -> T* { + auto val = static_cast(-1); + val <<= Log2MaxAlign; + // NOLINTNEXTLINE(performance-no-int-to-ptr): This is required by the API. + return reinterpret_cast(val); + } + + static auto getTombstoneKey() -> T* { + auto val = static_cast(-2); + val <<= Log2MaxAlign; + // NOLINTNEXTLINE(performance-no-int-to-ptr): This is required by the API. + return reinterpret_cast(val); + } + + static auto getHashValue(const T* ptr_val) -> unsigned { + return static_cast(HashValue(ptr_val)); + } + + static auto isEqual(const T* lhs, const T* rhs) -> bool { return lhs == rhs; } +}; + +template <> +struct CarbonHashDI { + static auto getEmptyKey() -> llvm::StringRef { + return llvm::StringRef( + // NOLINTNEXTLINE(performance-no-int-to-ptr): Required by the API. + reinterpret_cast(~static_cast(0)), 0); + } + + static auto getTombstoneKey() -> llvm::StringRef { + return llvm::StringRef( + // NOLINTNEXTLINE(performance-no-int-to-ptr): Required by the API. + reinterpret_cast(~static_cast(1)), 0); + } + static auto getHashValue(llvm::StringRef val) -> unsigned { + return static_cast(HashValue(val)); + } + static auto isEqual(llvm::StringRef lhs, llvm::StringRef rhs) -> bool { + if (rhs.data() == getEmptyKey().data()) { + return lhs.data() == getEmptyKey().data(); + } + if (rhs.data() == getTombstoneKey().data()) { + return lhs.data() == getTombstoneKey().data(); + } + return lhs == rhs; + } +}; + +} // namespace Carbon::RawHashtable + +#endif // CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_ diff --git a/common/raw_hashtable_metadata_group.cpp b/common/raw_hashtable_metadata_group.cpp new file mode 100644 index 0000000000000..1d0e4de9a9227 --- /dev/null +++ b/common/raw_hashtable_metadata_group.cpp @@ -0,0 +1,20 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "common/raw_hashtable_metadata_group.h" + +#include "llvm/ADT/StringExtras.h" + +namespace Carbon::RawHashtable { + +auto MetadataGroup::Print(llvm::raw_ostream& out) const -> void { + out << "["; + llvm::ListSeparator sep; + for (uint8_t byte : metadata_bytes) { + out << sep << llvm::formatv("{0:x2}", byte); + } + out << "]"; +} + +} // namespace Carbon::RawHashtable diff --git a/common/raw_hashtable_metadata_group.h b/common/raw_hashtable_metadata_group.h new file mode 100644 index 0000000000000..497db6009388b --- /dev/null +++ b/common/raw_hashtable_metadata_group.h @@ -0,0 +1,1093 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_ +#define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_ + +#include +#include +#include + +#include "common/check.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/bit.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MathExtras.h" + +// Detect whether we can use SIMD accelerated implementations of the control +// groups, and include the relevant platform specific APIs for the SIMD +// implementations. +// +// Reference documentation for the SIMD APIs used here: +// - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html +// - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html +#if defined(__SSSE3__) +#include +#define CARBON_X86_SIMD_SUPPORT 1 +#elif defined(__ARM_NEON) +#include +#define CARBON_NEON_SIMD_SUPPORT 1 +#endif + +// This namespace collects low-level utilities for implementing hashtable +// data structures. This file only provides one of them: +// +// - Primitives to manage "groups" of hashtable entries that have densely packed +// control bytes we can scan rapidly as a group, often using SIMD facilities +// to process the entire group at once. +namespace Carbon::RawHashtable { + +// We define a constant max group size. The particular group size used in +// practice may vary, but we want to have some upper bound used to ensure +// memory allocation is done consistently across different architectures. +constexpr ssize_t MaxGroupSize = 16; + +// This takes a collection of bits representing the results of looking for a +// particular tag in this metadata group and determines the first position with +// a match. The position is represented by either the least significant set bit +// or the least significant non-zero byte, depending on `ByteEncoding`. When +// represented with a non-zero byte, that byte must have at least its most +// significant bit set, but may have other bits set to any value. Bits more +// significant than the match may have any value provided there is at least one +// match. Zero matches must be represented by a zero input. +// +// Some bits of the underlying value may be known-zero, which can optimize +// various operations. These can be represented as a `ZeroMask`. +template +class BitIndex + : public Printable> { + public: + using BitsT = BitsInputT; + + BitIndex() = default; + explicit BitIndex(BitsT bits) : bits_(bits) {} + + friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool { + if (lhs.empty() || rhs.empty()) { + return lhs.empty() == rhs.empty(); + } + // For non-empty bit indices, compare the indices directly to ignore other + // (extraneous) parts of the incoming bits. 
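+    // For example, with byte encoding the values 0x0000'0000'0000'0080 and
+    // 0xff00'0000'0000'0080 both encode a first match at index 0, and so
+    // compare equal here even though their raw bits differ.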
+ return lhs.index() == rhs.index(); + } + + auto Print(llvm::raw_ostream& out) const -> void { + out << llvm::formatv("{0:x}", bits_); + } + + explicit operator bool() const { return !empty(); } + + // Returns true when there are no matches for the tag. + auto empty() const -> bool { + CARBON_DCHECK((bits_ & ZeroMask) == 0) << "Unexpected non-zero bits!"; + __builtin_assume((bits_ & ZeroMask) == 0); + return bits_ == 0; + } + + // Returns the index of the first matched tag. + auto index() -> ssize_t { + CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!"; + __builtin_assume(bits_ != 0); + ssize_t index = unscaled_index(); + + if constexpr (ByteEncoding) { + // Shift to scale out of the byte encoding. + index >>= ByteEncodingShift; + } + + return index; + } + + // Optimized tool to index a pointer `p` by `index()`. + template + auto index_ptr(T* pointer) -> T* { + CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!"; + __builtin_assume(bits_ != 0); + if constexpr (!ByteEncoding) { + return &pointer[unscaled_index()]; + } + + ssize_t index = unscaled_index(); + + // Scale the index as we counted zero *bits* and not zero *bytes*. + // However, we can fold that scale with the size of `T` when it is a power + // of two or divisible by 8. + CARBON_DCHECK( + (index & ((static_cast(1) << ByteEncodingShift) - 1)) == 0); + if constexpr (sizeof(T) % 8 == 0) { + constexpr size_t FoldedScale = sizeof(T) / 8; + index *= FoldedScale; + return reinterpret_cast( + &reinterpret_cast(pointer)[index]); + } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) { + constexpr size_t ScaleShift = llvm::CTLog2(); + static_assert(ScaleShift <= ByteEncodingShift, + "Scaling by >=8 should be handled above!"); + constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift; + index >>= FoldedShift; + return reinterpret_cast( + &reinterpret_cast(pointer)[index]); + } + + // Nothing we can fold here. + return &pointer[index >> ByteEncodingShift]; + } + + private: + // When using a byte encoding, we'll need to shift any index by this amount. + static constexpr size_t ByteEncodingShift = 3; + + auto unscaled_index() -> ssize_t { + if constexpr (!ByteEncoding) { + // Note the cast to `size_t` to force zero extending the result. + return static_cast(llvm::countr_zero(bits_)); + } else { + // The index is encoded in the high bit of each byte. We compute the index + // by counting the number of low zero bytes there are before the first + // byte with its high bit set. Rather that shifting the high bit to be the + // low bit and counting the trailing (least significant) zero bits + // directly, we instead byte-reverse the bits and count the *leading* + // (most significant) zero bits. While this may be a wash on CPUs with + // direct support for counting the trailing zero bits, AArch64 only + // supports counting the leading zero bits and requires a bit-reverse to + // count the trailing zero bits. Doing the byte-reverse approach + // essentially combines moving the high bit into the low bit and the + // reverse necessary for counting the zero bits. While this only removes + // one instruction, it is an instruction in the critical path of the + // hottest part of table lookup, and that critical path dependency height + // is few enough instructions that removing even one significantly impacts + // latency. + // + // We also cast to `size_t` to clearly zero-extend the result. 
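+      // For example, a match in byte 2 is encoded as 0x0000'0000'0080'0000;
+      // byte-reversing it gives 0x0000'8000'0000'0000, which has 16 leading
+      // zero bits, and `index()` then shifts by `ByteEncodingShift` to recover
+      // the index 2.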
+ return static_cast(llvm::countl_zero(llvm::byteswap(bits_))); + } + } + + BitsT bits_ = 0; +}; + +// This is like `BitIndex`, but allows iterating through all of the matches. +// +// A key requirement for efficient iteration is that all of the matches are +// represented with a single bit and there are no other bits set. For example, +// with byte-encoded bit indices, exactly the high bit and no other bit of each +// matching byte must be set. This is a stricter constraint than what `BitIndex` +// alone would impose on any one of the matches. +template +class BitIndexRange : public Printable> { + public: + using BitsT = BitIndexT::BitsT; + + class Iterator + : public llvm::iterator_facade_base { + public: + Iterator() = default; + explicit Iterator(BitsT bits) : bits_(bits) {} + + auto operator==(const Iterator& rhs) const -> bool { + return bits_ == rhs.bits_; + } + + auto operator*() -> ssize_t& { + CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!"; + __builtin_assume(bits_ != 0); + index_ = BitIndexT(bits_).index(); + // Note that we store the index in a member so we can return a reference + // to it here as required to be a forward iterator. + return index_; + } + + template + auto index_ptr(T* pointer) -> T* { + return BitIndexT(bits_).index_ptr(pointer); + } + + auto operator++() -> Iterator& { + CARBON_DCHECK(bits_ != 0) << "Must not increment past the end!"; + __builtin_assume(bits_ != 0); + // Clears the least significant set bit, effectively stepping to the next + // match. + bits_ &= (bits_ - 1); + return *this; + } + + private: + ssize_t index_; + BitsT bits_ = 0; + }; + + BitIndexRange() = default; + explicit BitIndexRange(BitsT bits) : bits_(bits) {} + + explicit operator bool() const { return !empty(); } + auto empty() const -> bool { return BitIndexT(bits_).empty(); } + + auto begin() const -> Iterator { return Iterator(bits_); } + auto end() const -> Iterator { return Iterator(); } + + friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool { + return lhs.bits_ == rhs.bits_; + } + + auto Print(llvm::raw_ostream& out) const -> void { + out << llvm::formatv("{0:x}", bits_); + } + + explicit operator BitsT() const { return bits_; } + explicit operator BitIndexT() const { return BitIndexT(bits_); } + + private: + BitsT bits_ = 0; +}; + +// A group of metadata bytes that can be manipulated together. +// +// The metadata bytes used Carbon's hashtable implementation are designed to +// support being manipulating as groups, either using architecture specific SIMD +// code sequences or using portable SIMD-in-an-integer-register code sequences. +// These operations are unusually performance sensitive and in sometimes +// surprising ways. The implementations here are crafted specifically to +// optimize the particular usages in Carbon's hashtable and should not be +// expected to be reusable in any other context. +// +// Throughout the functions operating on this type we try to use patterns with a +// fallback portable implementation which can be directly used in the absence of +// a SIMD implementation, but is also used (with the same code) to check that +// any SIMD implementation produces the same result as the portable one. These +// patterns help minimize un-compiled or un-tested paths through either portable +// or SIMD code, regardless of which path is actually *used* on a particular +// platform. To illustrate a common version of this pattern, we might have code +// like: +// +// ```cpp +// auto MetadataGroup::Operation(...) -> ... { +// ... 
portable_result; +// ... simd_result; +// if constexpr (!UseSIMD || DebugSIMD) { +// portable_result = PortableOperation(...); +// } +// if (UseSIMD || DebugSIMD) { +// simd_result = SIMDOperation(...) +// CARBON_DCHECK(result == portable_result) << ...; +// } +// return UseSIMD ? simd_result : portable_result; +// } +// ``` +class MetadataGroup : public Printable { + public: + static constexpr ssize_t Size = +#if CARBON_X86_SIMD_SUPPORT + 16; +#else + 8; +#endif + static_assert(Size >= 8); + static_assert(Size % 8 == 0); + static_assert(Size <= MaxGroupSize); + static_assert(MaxGroupSize % Size == 0); + static_assert(llvm::isPowerOf2_64(Size), + "The group size must be a constant power of two so dividing by " + "it is a simple shift."); + static constexpr ssize_t Mask = Size - 1; + + // Each control byte can have special values. All special values have the + // most significant bit cleared to distinguish them from the seven hash bits + // stored when the control byte represents a full bucket. + // + // Otherwise, their values are chose primarily to provide efficient SIMD + // implementations of the common operations on an entire control group. + static constexpr uint8_t Empty = 0; + static constexpr uint8_t Deleted = 1; + + static constexpr uint8_t PresentMask = 0b1000'0000; + + // Some architectures make it much more efficient to build the match indices + // in a byte-encoded form rather than a bit-encoded form. This encoding + // changes verification and other aspects of our algorithms. + static constexpr bool ByteEncoding = +#if CARBON_X86_SIMD_SUPPORT + false; +#else + true; +#endif + static_assert(!ByteEncoding || Size == 8, + "We can only support byte encoding with a group size of 8."); + + // We need to indicate to users of the metadata group when they can hold a + // group value in a "register" (local variable) across clearing of individual + // bytes in the group efficiently. If the entire group can fit in an integer + // register, this works well and clients of the group should work to use the + // already-loaded value when clearing bytes. But when we have a larger group + // size, clearing the byte will typically require storing a byte to memory and + // re-loading the group. The usage patterns that need to clear bytes can in + // those cases avoid clearing a loaded group, and clear the byte directly in + // the larger metadata array. + static constexpr bool FastByteClear = Size == 8; + + using MatchIndex = + BitIndex, + ByteEncoding, + /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>; + using MatchRange = BitIndexRange; + + union { + uint8_t metadata_bytes[Size]; + uint64_t metadata_ints[Size / 8]; +#if CARBON_NEON_SIMD_SUPPORT + uint8x8_t metadata_vec = {}; + static_assert(sizeof(metadata_vec) == Size); +#elif CARBON_X86_SIMD_SUPPORT + __m128i metadata_vec = {}; + static_assert(sizeof(metadata_vec) == Size); +#endif + }; + + auto Print(llvm::raw_ostream& out) const -> void; + + friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool { + return CompareEqual(lhs, rhs); + } + + // The main API for this class. This API will switch between a portable and + // SIMD implementation based on what is most efficient, but in debug builds + // will cross check that the implementations do not diverge. + + // Load and return a group of metadata bytes out of the main metadata array at + // a particular `index`. The index must be a multiple of `GroupSize`. This + // will arrange for the load to place the group into the correct structure for + // efficient register-based processing. 
+ static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup; + + // Store this metadata group into the main metadata array at the provided + // `index`. The index must be a multiple of `GroupSize`. + auto Store(uint8_t* metadata, ssize_t index) const -> void; + + // Clear a byte of this group's metadata at the provided `byte_index` to the + // empty value. Note that this must only be called when `FastByteClear` is + // true -- in all other cases users of this class should arrange to clear + // individual bytes in the underlying array rather than using the group API. + auto ClearByte(ssize_t byte_index) -> void; + + // Clear all of this group's metadata bytes that indicate a deleted slot to + // the empty value. + auto ClearDeleted() -> void; + + // Find all of the bytes of metadata in this group that are present and whose + // low 7 bits match the provided `tag`. The `tag` byte must have a clear high + // bit, only 7 bits of tag are used. Note that this means the provided tag is + // *not* the actual present metadata byte -- this function is responsible for + // mapping the tag into that form as it can do so more efficiently in some + // cases. A range over all of the byte indices which matched is returned. + auto Match(uint8_t tag) const -> MatchRange; + + // Find all of the present bytes of metadata in this group. A range over all + // of the byte indices which are present is returned. + auto MatchPresent() const -> MatchRange; + + // Find the first byte of the metadata group that is empty and return that + // index. There is no order or position required for which of the bytes of + // metadata is considered "first", any model will do that makes it efficient + // to produce the matching index. Must return an empty match index if no bytes + // match the empty metadata. + auto MatchEmpty() const -> MatchIndex; + + // Find the first byte of the metadata group that is deleted and return that + // index. There is no order or position required for which of the bytes of + // metadata is considered "first", any model will do that makes it efficient + // to produce the matching index. Must return an empty match index if no bytes + // match the deleted metadata. + auto MatchDeleted() const -> MatchIndex; + + private: + // Two classes only defined in the benchmark code are allowed to directly call + // the portable and SIMD implementations for benchmarking purposes. + friend class BenchmarkPortableMetadataGroup; + friend class BenchmarkSIMDMetadataGroup; + + // Whether to use a SIMD implementation. Even when we *support* a SIMD + // implementation, we do not always have to use it in the event that it is + // less efficient than the portable version. + static constexpr bool UseSIMD = +#if CARBON_X86_SIMD_SUPPORT + true; +#else + false; +#endif + + // All SIMD variants that we have an implementation for should be enabled for + // debugging. This lets us maintain a SIMD implementation even if it is not + // used due to performance reasons, and easily re-enable it if the performance + // changes. + static constexpr bool DebugSIMD = +#if !defined(NDEBUG) && (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT) + true; +#else + false; +#endif + + // Most and least significant bits set. 
+ static constexpr uint64_t MSBs = 0x8080'8080'8080'8080ULL; + static constexpr uint64_t LSBs = 0x0101'0101'0101'0101ULL; + + using MatchBitsT = MatchIndex::BitsT; + + static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool; + + // Functions for validating the returned matches agree with what is predicted + // by the `byte_match` function. These either `CHECK`-fail or return true. To + // pass validation, the `*_bits` argument must have `0x80` for those bytes + // where `byte_match` returns true, and `0` for the rest. + + // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only + // promise to return accurate information up to the first match. + auto VerifyIndexBits( + MatchBitsT index_bits, + llvm::function_refbool> byte_match) const -> bool; + // `VerifyRangeBits` is for functions that return `MatchRange`, and so it + // validates all the bytes of `range_bits`. + auto VerifyRangeBits( + MatchBitsT range_bits, + llvm::function_refbool> byte_match) const -> bool; + + // Portable implementations of each operation. These are used on platforms + // without SIMD support or where the portable implementation is faster than + // SIMD. They are heavily optimized even though they are not SIMD because we + // expect there to be platforms where the portable implementation can + // outperform SIMD. Their behavior and semantics exactly match the + // documentation for the un-prefixed functions. + // + // In debug builds, these also directly verify their results to help establish + // baseline functionality. + static auto PortableLoad(const uint8_t* metadata, ssize_t index) + -> MetadataGroup; + auto PortableStore(uint8_t* metadata, ssize_t index) const -> void; + + auto PortableClearDeleted() -> void; + + auto PortableMatch(uint8_t tag) const -> MatchRange; + auto PortableMatchPresent() const -> MatchRange; + + auto PortableMatchEmpty() const -> MatchIndex; + auto PortableMatchDeleted() const -> MatchIndex; + + static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs) + -> bool; + + // SIMD implementations of each operation. We minimize platform-specific APIs + // to reduce the scope of errors that can only be discoverd building on one + // platform, so the bodies of these contain the platform specific code. Their + // behavior and semantics exactly match the documentation for the un-prefixed + // functions. + // + // These routines don't directly verify their results as we can build simpler + // debug checks by comparing them against the verified portable results. + static auto SIMDLoad(const uint8_t* metadata, ssize_t index) -> MetadataGroup; + auto SIMDStore(uint8_t* metadata, ssize_t index) const -> void; + + auto SIMDClearDeleted() -> void; + + auto SIMDMatch(uint8_t tag) const -> MatchRange; + auto SIMDMatchPresent() const -> MatchRange; + + auto SIMDMatchEmpty() const -> MatchIndex; + auto SIMDMatchDeleted() const -> MatchIndex; + + static auto SIMDCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool; + +#if CARBON_X86_SIMD_SUPPORT + // A common routine for x86 SIMD matching that can be used for matching + // present, empty, and deleted bytes with equal efficiency. + auto X86SIMDMatch(uint8_t match_byte) const -> MatchRange; +#endif +}; + +// Promote the size and mask to top-level constants as we'll need to operate on +// the grouped structure outside of the metadata bytes. 
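+// As a rough sketch of how code outside this file combines these constants
+// with the group API (with hypothetical `metadata`, `alloc_size`, and `tag`
+// values, and ignoring that real lookups probe from a hash-derived start
+// rather than from zero):
+//
+// ```cpp
+// for (ssize_t index = 0; index < alloc_size; index += GroupSize) {
+//   auto g = MetadataGroup::Load(metadata, index);
+//   for (ssize_t byte_index : g.Match(tag)) {
+//     // Compare the key stored in the entry at `index + byte_index`...
+//   }
+//   if (g.MatchEmpty()) {
+//     // An empty slot terminates the probe sequence.
+//     break;
+//   }
+// }
+// ```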
+inline constexpr ssize_t GroupSize = MetadataGroup::Size; +inline constexpr ssize_t GroupMask = MetadataGroup::Mask; + +inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index) + -> MetadataGroup { + MetadataGroup portable_g; + if constexpr (!UseSIMD || DebugSIMD) { + portable_g = PortableLoad(metadata, index); + if constexpr (!UseSIMD) { + return portable_g; + } + } + MetadataGroup g = SIMDLoad(metadata, index); + CARBON_DCHECK(g == portable_g); + return g; +} + +inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const + -> void { + if constexpr (!UseSIMD) { + std::memcpy(metadata + index, &metadata_bytes, Size); + } else { + SIMDStore(metadata, index); + } + CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size)); +} + +inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void { + CARBON_DCHECK(FastByteClear) << "Only use byte clearing when fast!"; + CARBON_DCHECK(Size == 8) + << "The clear implementation assumes an 8-byte group."; + + metadata_ints[0] &= ~(static_cast(0xff) << (byte_index * 8)); +} + +inline auto MetadataGroup::ClearDeleted() -> void { + MetadataGroup portable_g = *this; + MetadataGroup simd_g = *this; + if constexpr (!UseSIMD || DebugSIMD) { + portable_g.PortableClearDeleted(); + } + if constexpr (UseSIMD || DebugSIMD) { + simd_g.SIMDClearDeleted(); + CARBON_DCHECK(simd_g == portable_g) + << "SIMD cleared group '" << simd_g + << "' doesn't match portable cleared group '" << portable_g << "'"; + } + *this = UseSIMD ? simd_g : portable_g; +} + +inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange { + // The caller should provide us with the present byte hash, and not set any + // present bit tag on it so that this layer can manage tagging the high bit of + // a present byte. + CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag); + + MatchRange portable_result; + MatchRange simd_result; + if constexpr (!UseSIMD || DebugSIMD) { + portable_result = PortableMatch(tag); + } + if constexpr (UseSIMD || DebugSIMD) { + simd_result = SIMDMatch(tag); + CARBON_DCHECK(simd_result == portable_result) + << "SIMD result '" << simd_result << "' doesn't match portable result '" + << portable_result << "'"; + } + return UseSIMD ? simd_result : portable_result; +} + +inline auto MetadataGroup::MatchPresent() const -> MatchRange { + MatchRange portable_result; + MatchRange simd_result; + if constexpr (!UseSIMD || DebugSIMD) { + portable_result = PortableMatchPresent(); + } + if constexpr (UseSIMD || DebugSIMD) { + simd_result = SIMDMatchPresent(); + CARBON_DCHECK(simd_result == portable_result) + << "SIMD result '" << simd_result << "' doesn't match portable result '" + << portable_result << "'"; + } + return UseSIMD ? simd_result : portable_result; +} + +inline auto MetadataGroup::MatchEmpty() const -> MatchIndex { + MatchIndex portable_result; + MatchIndex simd_result; + if constexpr (!UseSIMD || DebugSIMD) { + portable_result = PortableMatchEmpty(); + } + if constexpr (UseSIMD || DebugSIMD) { + simd_result = SIMDMatchEmpty(); + CARBON_DCHECK(simd_result == portable_result) + << "SIMD result '" << simd_result << "' doesn't match portable result '" + << portable_result << "'"; + } + return UseSIMD ? 
simd_result : portable_result; +} + +inline auto MetadataGroup::MatchDeleted() const -> MatchIndex { + MatchIndex portable_result; + MatchIndex simd_result; + if constexpr (!UseSIMD || DebugSIMD) { + portable_result = PortableMatchDeleted(); + } + if constexpr (UseSIMD || DebugSIMD) { + simd_result = SIMDMatchDeleted(); + CARBON_DCHECK(simd_result == portable_result) + << "SIMD result '" << simd_result << "' doesn't match portable result '" + << portable_result << "'"; + } + return UseSIMD ? simd_result : portable_result; +} + +inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs) + -> bool { + bool portable_result; + bool simd_result; + if constexpr (!UseSIMD || DebugSIMD) { + portable_result = PortableCompareEqual(lhs, rhs); + } + if constexpr (UseSIMD || DebugSIMD) { + simd_result = SIMDCompareEqual(lhs, rhs); + CARBON_DCHECK(simd_result == portable_result); + } + return UseSIMD ? simd_result : portable_result; +} + +inline auto MetadataGroup::VerifyIndexBits( + MatchBitsT index_bits, + llvm::function_refbool> byte_match) const -> bool { + for (ssize_t byte_index : llvm::seq(0, Size)) { + if constexpr (!ByteEncoding) { + if (byte_match(metadata_bytes[byte_index])) { + CARBON_CHECK(((index_bits >> byte_index) & 1) == 1) + << "Bit not set at matching byte index: " << byte_index; + // Only the first match is needed, so stop scanning once found. + break; + } + + CARBON_CHECK(((index_bits >> byte_index) & 1) == 0) + << "Bit set at non-matching byte index: " << byte_index; + } else { + // `index_bits` is byte-encoded rather than bit encoded, so extract a + // byte. + uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF; + if (byte_match(metadata_bytes[byte_index])) { + CARBON_CHECK((index_byte & 0x80) == 0x80) + << "Should have the high bit set for a matching byte, found: " + << llvm::formatv("{0:x}", index_byte); + // Only the first match is needed so stop scanning once found. + break; + } + + CARBON_CHECK(index_byte == 0) + << "Should have no bits set for an unmatched byte, found: " + << llvm::formatv("{0:x}", index_byte); + } + } + return true; +} + +inline auto MetadataGroup::VerifyRangeBits( + MatchBitsT range_bits, + llvm::function_refbool> byte_match) const -> bool { + for (ssize_t byte_index : llvm::seq(0, Size)) { + if constexpr (!ByteEncoding) { + if (byte_match(metadata_bytes[byte_index])) { + CARBON_CHECK(((range_bits >> byte_index) & 1) == 1) + << "Bit not set at matching byte index: " << byte_index; + } else { + CARBON_CHECK(((range_bits >> byte_index) & 1) == 0) + << "Bit set at non-matching byte index: " << byte_index; + } + } else { + // `range_bits` is byte-encoded rather than bit encoded, so extract a + // byte. 
+ uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF; + if (byte_match(metadata_bytes[byte_index])) { + CARBON_CHECK(range_byte == 0x80) + << "Should just have the high bit set for a matching byte, found: " + << llvm::formatv("{0:x}", range_byte); + } else { + CARBON_CHECK(range_byte == 0) + << "Should have no bits set for an unmatched byte, found: " + << llvm::formatv("{0:x}", range_byte); + } + } + } + return true; +} + +inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index) + -> MetadataGroup { + MetadataGroup g; + static_assert(sizeof(g) == Size); + std::memcpy(&g.metadata_bytes, metadata + index, Size); + return g; +} + +inline auto MetadataGroup::PortableStore(uint8_t* metadata, ssize_t index) const + -> void { + std::memcpy(metadata + index, &metadata_bytes, Size); +} + +inline auto MetadataGroup::PortableClearDeleted() -> void { + for (uint64_t& metadata_int : metadata_ints) { + // Deleted bytes have only the least significant bits set, so to clear them + // we only need to clear the least significant bit. And empty bytes already + // have a clear least significant bit, so the only least significant bits we + // need to preserve are those of present bytes. The most significant bit of + // every present byte is set, so we take the most significant bit of each + // byte, shift it into the least significant bit position, and bit-or it + // with the compliment of `LSBs`. This will have ones for every bit but the + // least significant bits, and ones for the least significant bits of every + // present byte. + metadata_int &= (~LSBs | metadata_int >> 7); + } +} + +inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange { + // The caller should provide us with the present byte hash, and not set any + // present bit tag on it so that this layer can manage tagging the high bit of + // a present byte. + CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag); + + // Use a simple fallback approach for sizes beyond 8. + // TODO: Instead of a simple fallback, we should generalize the below + // algorithm for sizes above 8, even if to just exercise the same code on + // more platforms. + if constexpr (Size > 8) { + static_assert(Size <= 32, "Sizes larger than 32 not yet supported!"); + uint32_t match_bits = 0; + uint32_t bit = 1; + uint8_t present_byte = tag | PresentMask; + for (ssize_t i : llvm::seq(0, Size)) { + if (metadata_bytes[i] == present_byte) { + match_bits |= bit; + } + bit <<= 1; + } + return MatchRange(match_bits); + } + + // This algorithm only works for matching *present* bytes. We leverage the + // set high bit in the present case as part of the algorithm. The whole + // algorithm has a critical path height of 4 operations, and does 6 + // operations total on AArch64. The operation dependency graph is: + // + // group | MSBs LSBs * match_byte + MSBs + // \ / + // match_bits ^ broadcast + // | + // group & MSBs MSBs - match_bits + // \ / + // group_MSBs & match_bits + // + // This diagram and the operation count are specific to AArch64 where we have + // a fused *integer* multiply-add operation. + // + // While it is superficially similar to the "find zero bytes in a word" bit + // math trick, it is different because this is designed to have no false + // positives and perfectly produce 0x80 for matching bytes and 0x00 for + // non-matching bytes. This is do-able because we constrain to only handle + // present matches which only require testing 7 bits and have a particular + // layout. 
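+  //
+  // As a hypothetical worked example, matching tag 0x23 against a group whose
+  // bytes are [Empty, Deleted, 0xa3, Empty, Empty, Empty, Empty, Empty]:
+  //   metadata | MSBs          = 0x8080'8080'80a3'8180
+  //   LSBs * 0x23 + MSBs       = 0xa3a3'a3a3'a3a3'a3a3
+  //   xor of those two         = 0x2323'2323'2300'2223
+  //   MSBs - that              = 0x5d5d'5d5d'5d80'5e5d
+  //   ... & (metadata & MSBs)  = 0x0000'0000'0080'0000
+  // leaving the high bit set only in byte 2, the one present byte whose low
+  // seven bits equal the tag.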
+ + // Set the high bit of every byte to `1`. Any matching byte is a present byte + // and so always has this bit set as well, which means the xor below, in + // addition to zeroing the low 7 bits of any byte that matches the tag, also + // clears the high bit of every byte. + uint64_t match_bits = metadata_ints[0] | MSBs; + // Broadcast the match byte to all bytes, and mask in the present bits in the + // MSBs of each byte. We structure this as a multiply and an add because we + // know that the add cannot carry, and this way it can be lowered using + // combined multiply-add instructions if available. + uint64_t broadcast = LSBs * tag + MSBs; + CARBON_DCHECK(broadcast == (LSBs * tag | MSBs)) + << "Unexpected carry from addition!"; + + // Xor the broadcast byte pattern. This makes bytes with matches become 0, and + // clears the high-bits of non-matches. Note that if we are looking for a tag + // with the same value as `Empty` or `Deleted`, those bytes will be zero as + // well. + match_bits = match_bits ^ broadcast; + // Subtract each byte of `match_bits` from `0x80` bytes. After this, the high + // bit will be set only for those bytes that were zero. + match_bits = MSBs - match_bits; + // Zero everything but the high bits, and also zero the high bits of any bytes + // for "not present" slots in the original group. This avoids false positives + // for `Empty` and `Deleted` bytes in the metadata. + match_bits &= (metadata_ints[0] & MSBs); + + // At this point, `match_bits` has the high bit set for bytes where the + // original group byte equals `tag` plus the high bit. + CARBON_DCHECK(VerifyRangeBits( + match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); })); + return MatchRange(match_bits); +} + +inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange { + // Use a simple fallback approach for sizes beyond 8. + // TODO: Instead of a simple fallback, we should generalize the below + // algorithm for sizes above 8, even if to just exercise the same code on + // more platforms. + if constexpr (Size > 8) { + static_assert(Size <= 32, "Sizes larger than 32 not yet supported!"); + uint32_t match_bits = 0; + uint32_t bit = 1; + for (ssize_t i : llvm::seq(0, Size)) { + if (metadata_bytes[i] & PresentMask) { + match_bits |= bit; + } + bit <<= 1; + } + return MatchRange(match_bits); + } + + // Want to keep the high bit of each byte, which indicates whether that byte + // represents a present slot. + uint64_t match_bits = metadata_ints[0] & MSBs; + + CARBON_DCHECK(VerifyRangeBits( + match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; })); + return MatchRange(match_bits); +} + +inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex { + // Use a simple fallback approach for sizes beyond 8. + // TODO: Instead of a simple fallback, we should generalize the below + // algorithm for sizes above 8, even if to just exercise the same code on + // more platforms. + if constexpr (Size > 8) { + static_assert(Size <= 32, "Sizes larger than 32 not yet supported!"); + uint32_t bit = 1; + for (ssize_t i : llvm::seq(0, Size)) { + if (metadata_bytes[i] == Empty) { + return MatchIndex(bit); + } + bit <<= 1; + } + return MatchIndex(0); + } + + // This sets the high bit of every byte in `match_bits` unless the + // corresponding metadata byte is 0. 
We take advantage of the fact that + // the metadata bytes in are non-zero only if they are either: + // - present: in which case the high bit of the byte will already be set; or + // - deleted: in which case the byte will be 1, and shifting it left by 7 will + // cause the high bit to be set. + uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7); + // This inverts the high bits of the bytes, and clears the remaining bits. + match_bits = ~match_bits & MSBs; + + // The high bits of the bytes of `match_bits` are set if the corresponding + // metadata byte is `Empty`. + CARBON_DCHECK( + VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; })); + return MatchIndex(match_bits); +} + +inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex { + // Use a simple fallback approach for sizes beyond 8. + // TODO: Instead of a simple fallback, we should generalize the below + // algorithm for sizes above 8, even if to just exercise the same code on + // more platforms. + if constexpr (Size > 8) { + static_assert(Size <= 32, "Sizes larger than 32 not yet supported!"); + uint32_t bit = 1; + for (ssize_t i : llvm::seq(0, Size)) { + if (metadata_bytes[i] == Deleted) { + return MatchIndex(bit); + } + bit <<= 1; + } + return MatchIndex(0); + } + + // This sets the high bit of every byte in `match_bits` unless the + // corresponding metadata byte is 1. We take advantage of the fact that the + // metadata bytes are not 1 only if they are either: + // - present: in which case the high bit of the byte will already be set; or + // - empty: in which case the byte will be 0, and in that case inverting and + // shifting left by 7 will have the high bit set. + uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7); + // This inverts the high bits of the bytes, and clears the remaining bits. + match_bits = ~match_bits & MSBs; + + // The high bits of the bytes of `match_bits` are set if the corresponding + // metadata byte is `Deleted`. + CARBON_DCHECK(VerifyIndexBits(match_bits, + [](uint8_t byte) { return byte == Deleted; })); + return MatchIndex(match_bits); +} + +inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs, + MetadataGroup rhs) -> bool { + return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes); +} + +inline auto MetadataGroup::SIMDLoad(const uint8_t* metadata, ssize_t index) + -> MetadataGroup { + MetadataGroup g; +#if CARBON_NEON_SIMD_SUPPORT + g.metadata_vec = vld1_u8(metadata + index); +#elif CARBON_X86_SIMD_SUPPORT + g.metadata_vec = + _mm_load_si128(reinterpret_cast(metadata + index)); +#else + static_assert(!UseSIMD, "Unimplemented SIMD operation"); + static_cast(metadata); + static_cast(index); +#endif + return g; +} + +inline auto MetadataGroup::SIMDStore(uint8_t* metadata, ssize_t index) const + -> void { +#if CARBON_NEON_SIMD_SUPPORT + vst1_u8(metadata + index, metadata_vec); +#elif CARBON_X86_SIMD_SUPPORT + _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec); +#else + static_assert(!UseSIMD, "Unimplemented SIMD operation"); + static_cast(metadata); + static_cast(index); +#endif +} + +inline auto MetadataGroup::SIMDClearDeleted() -> void { +#if CARBON_NEON_SIMD_SUPPORT + // There is no good Neon operation to implement this, so do it using integer + // code. This is reasonably fast, but unfortunate because it forces the group + // out of a SIMD register and into a general purpose register, which can have + // high latency. 
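+  //
+  // Per byte this keeps every bit except the least significant one, and keeps
+  // that bit only when the byte's own high bit is set. For example, bytes
+  // [0x00, 0x01, 0xa3, ...] (empty, deleted, present) become
+  // [0x00, 0x00, 0xa3, ...]: the deleted byte collapses to `Empty` while
+  // present bytes are untouched.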
+ metadata_ints[0] &= (~LSBs | metadata_ints[0] >> 7); +#elif CARBON_X86_SIMD_SUPPORT + // For each byte, use `metadata_vec` if the byte's high bit is set (indicating + // it is present), otherwise (it is empty or deleted) replace it with zero + // (representing empty). + metadata_vec = + _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec); +#else + static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation"); +#endif +} + +inline auto MetadataGroup::SIMDMatch(uint8_t tag) const -> MatchRange { + MatchRange result; +#if CARBON_NEON_SIMD_SUPPORT + // Broadcast byte we want to match to every byte in the vector. + auto match_byte_vec = vdup_n_u8(tag | PresentMask); + // Result bytes have all bits set for the bytes that match, so we have to + // clear everything but MSBs next. + auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec); + uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0]; + // The matched range is likely to be tested for zero by the caller, and that + // test can often be folded into masking the bits with `MSBs` when we do that + // mask in the scalar domain rather than the SIMD domain. So we do the mask + // here rather than above prior to extracting the match bits. + result = MatchRange(match_bits & MSBs); +#elif CARBON_X86_SIMD_SUPPORT + result = X86SIMDMatch(tag | PresentMask); +#else + static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation"); + static_cast(tag); +#endif + return result; +} + +inline auto MetadataGroup::SIMDMatchPresent() const -> MatchRange { + MatchRange result; +#if CARBON_NEON_SIMD_SUPPORT + // Just extract the metadata directly. + uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0]; + // The matched range is likely to be tested for zero by the caller, and that + // test can often be folded into masking the bits with `MSBs` when we do that + // mask in the scalar domain rather than the SIMD domain. So we do the mask + // here rather than above prior to extracting the match bits. + result = MatchRange(match_bits & MSBs); +#elif CARBON_X86_SIMD_SUPPORT + // We arranged the byte vector so that present bytes have the high bit set, + // which this instruction extracts. + result = MatchRange(_mm_movemask_epi8(metadata_vec)); +#else + static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation"); +#endif + return result; +} + +inline auto MetadataGroup::SIMDMatchEmpty() const -> MatchIndex { + MatchIndex result; +#if CARBON_NEON_SIMD_SUPPORT + // Compare all bytes with zero, as that is the empty byte value. Result will + // have all bits set for any input zero byte, so we zero all but the high bits + // below. + auto cmp_vec = vceqz_u8(metadata_vec); + uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0]; + // The matched range is likely to be tested for zero by the caller, and that + // test can often be folded into masking the bits with `MSBs` when we do that + // mask in the scalar domain rather than the SIMD domain. So we do the mask + // here rather than above prior to extracting the match bits. + result = MatchIndex(metadata_bits & MSBs); +#elif CARBON_X86_SIMD_SUPPORT + // Even though we only need the first match rather than all matches, we don't + // have a more efficient way to compute this on x86 and so we reuse the + // general match infrastructure that computes all matches in a bit-encoding. + // We then convert it into a `MatchIndex` that just finds the first one. 
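+  //
+  // Note that `Empty` is zero, so this broadcasts a zero byte and compares for
+  // equality, mirroring the `vceqz_u8` comparison in the Neon path above.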
+ result = static_cast(X86SIMDMatch(Empty)); +#else + static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation"); +#endif + return result; +} + +inline auto MetadataGroup::SIMDMatchDeleted() const -> MatchIndex { + MatchIndex result; +#if CARBON_NEON_SIMD_SUPPORT + // Broadcast the `Deleted` byte across the vector and compare the bytes of + // that with the metadata vector. The result will have all bits set for any + // input zero byte, so we zero all but the high bits below. + auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted)); + uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0]; + // The matched range is likely to be tested for zero by the caller, and that + // test can often be folded into masking the bits with `MSBs` when we do that + // mask in the scalar domain rather than the SIMD domain. So we do the mask + // here rather than above prior to extracting the match bits. + result = MatchIndex(match_bits & MSBs); +#elif CARBON_X86_SIMD_SUPPORT + // Even though we only need the first match rather than all matches, we don't + // have a more efficient way to compute this on x86 and so we reuse the + // general match infrastructure that computes all matches in a bit-encoding. + // We then convert it into a `MatchIndex` that just finds the first one. + result = static_cast(X86SIMDMatch(Deleted)); +#else + static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation"); +#endif + return result; +} + +inline auto MetadataGroup::SIMDCompareEqual(MetadataGroup lhs, + MetadataGroup rhs) -> bool { +#if CARBON_NEON_SIMD_SUPPORT + return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] == + static_cast(-1LL); +#elif CARBON_X86_SIMD_SUPPORT + // Different x86 SIMD extensions provide different comparison functionality + // available. +#if __SSE4_2__ + // With SSE 4.2, we can directly test and branch in the SIMD domain on whether + // the two metadata vectors are equal. + return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec), + _mm_set1_epi8(0xff)) == 1; +#else + // With older versions of SSE we have to extract the result of the comparison, + // much like we do when matching. That will have the usual bitmask + // representing equal bytes, and test for that exact bitmask in scalar code. + return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec, + rhs.metadata_vec)) == 0x0000'ffffU; +#endif +#else + static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation"); + static_cast(lhs); + static_cast(rhs); + return false; +#endif +} + +#if CARBON_X86_SIMD_SUPPORT +inline auto MetadataGroup::X86SIMDMatch(uint8_t match_byte) const + -> MatchRange { + // Broadcast the byte we're matching against to all bytes in a vector, and + // compare those bytes with the metadata vector bytes. + auto match_byte_vec = _mm_set1_epi8(match_byte); + auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec); + // Extract the result of each byte-wise comparison into the low bits of an + // integer. 
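+  //
+  // `_mm_movemask_epi8` gathers the high bit of each of the 16 comparison
+  // result bytes, so bit `i` of `match_bits` is set exactly when byte `i` of
+  // the group equals `match_byte`.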
+ uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec); + return MatchRange(match_bits); +} +#endif + +} // namespace Carbon::RawHashtable + +#endif // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_ diff --git a/common/raw_hashtable_metadata_group_benchmark.cpp b/common/raw_hashtable_metadata_group_benchmark.cpp new file mode 100644 index 0000000000000..62f85f53cfde9 --- /dev/null +++ b/common/raw_hashtable_metadata_group_benchmark.cpp @@ -0,0 +1,328 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include + +#include "absl/random/random.h" +#include "common/raw_hashtable_metadata_group.h" + +namespace Carbon::RawHashtable { + +// If we have any SIMD support, create dedicated benchmark utilities for the +// portable and SIMD implementation so we can directly benchmark both. +#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT +// Override the core API with explicit use of the portable API. +class BenchmarkPortableMetadataGroup : public MetadataGroup { + public: + explicit BenchmarkPortableMetadataGroup(MetadataGroup g) : MetadataGroup(g) {} + + static auto Load(uint8_t* metadata, ssize_t index) + -> BenchmarkPortableMetadataGroup { + return BenchmarkPortableMetadataGroup(PortableLoad(metadata, index)); + } + auto Store(uint8_t* metadata, ssize_t index) const -> void { + PortableStore(metadata, index); + } + + auto ClearDeleted() -> void { PortableClearDeleted(); } + + auto Match(uint8_t present_byte) const -> MatchRange { + return PortableMatch(present_byte); + } + auto MatchPresent() const -> MatchRange { return PortableMatchPresent(); } + + auto MatchEmpty() const -> MatchIndex { return PortableMatchEmpty(); } + auto MatchDeleted() const -> MatchIndex { return PortableMatchDeleted(); } +}; + +// Override the core API with explicit use of the SIMD API. +class BenchmarkSIMDMetadataGroup : public MetadataGroup { + public: + explicit BenchmarkSIMDMetadataGroup(MetadataGroup g) : MetadataGroup(g) {} + + static auto Load(uint8_t* metadata, ssize_t index) + -> BenchmarkSIMDMetadataGroup { + return BenchmarkSIMDMetadataGroup(SIMDLoad(metadata, index)); + } + auto Store(uint8_t* metadata, ssize_t index) const -> void { + SIMDStore(metadata, index); + } + + auto ClearDeleted() -> void { SIMDClearDeleted(); } + + auto Match(uint8_t present_byte) const -> MatchRange { + return SIMDMatch(present_byte); + } + auto MatchPresent() const -> MatchRange { return SIMDMatchPresent(); } + + auto MatchEmpty() const -> MatchIndex { return SIMDMatchEmpty(); } + auto MatchDeleted() const -> MatchIndex { return SIMDMatchDeleted(); } +}; +#endif + +namespace { + +// The number of metadata groups we use when benchmarking a particular scenario +// of matching within a group. +constexpr ssize_t BenchSize = 256; + +#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT +using PortableGroup = BenchmarkPortableMetadataGroup; +using SIMDGroup = BenchmarkSIMDMetadataGroup; +#endif + +struct BenchMetadata { + // The metadata for benchmarking, arranged in `BenchSize` groups, each one + // `GroupSize` in length. As a consequence, the size of this array will always + // be `BenchSize * GroupSize`. + llvm::MutableArrayRef metadata; + + // For benchmarking random matches in the metadata, each byte here is the tag + // that should be matched against the corresponding group of the metadata. 
+ // Because this array parallels the *groups* of the metadata array, its size + // will be `BenchSize`. For other kinds, this is empty. + llvm::ArrayRef bytes; +}; + +enum class BenchKind : uint8_t { + Random, + Empty, + Deleted, +}; + +// This routine should only be called once per `BenchKind` as the initializer of +// a global variable below. It returns an `ArrayRef` pointing into +// function-local static storage that provides our benchmark metadata. +// +// The returned array will have exactly `GroupSize` elements, each of +// `BenchMetadata`. For the `BenchMetadata` at index `i`, there will be `i+1` +// matches of that kind within each group of the metadata. This lets us +// benchmark each of the possible match-counts for a group. +template +static auto BuildBenchMetadata() -> llvm::ArrayRef { + // We build `GroupSize` elements of `BenchMetadata` below, and so we need + // `GroupSize` copies of each of these arrays to serve as inputs to it. + // + // The first storage is of `BenchSize` groups of metadata. + static uint8_t metadata_storage[GroupSize][BenchSize * GroupSize]; + // When `Kind` is `Random`, each group above will have a *different* byte that + // matches in that group. This array stores those bytes for the benchmark to + // match against the group. + static uint8_t bytes_storage[GroupSize][BenchSize]; + + // The backing storage for the returned `ArrayRef`. + static BenchMetadata bm_storage[GroupSize]; + + absl::BitGen gen; + for (auto [bm_index, bm] : llvm::enumerate(bm_storage)) { + int match_count = bm_index + 1; + + for (ssize_t g_index : llvm::seq(0, BenchSize)) { + // Start by filling the group with random bytes. + auto group_bytes = llvm::MutableArrayRef( + &metadata_storage[bm_index][g_index * GroupSize], GroupSize); + for (uint8_t& b : group_bytes) { + b = absl::Uniform(gen) | MetadataGroup::PresentMask; + } + + // Now we need up to `match_count` random indices into the group where + // we'll put a matching byte. + std::array group_indices; + std::iota(group_indices.begin(), group_indices.end(), 0); + std::shuffle(group_indices.begin(), group_indices.end(), gen); + + // Now cause the first match index to have the desired value. + ssize_t match_index = *group_indices.begin(); + uint8_t& match_b = group_bytes[match_index]; + switch (Kind) { + case BenchKind::Random: { + // Already a random value, but we need to ensure it isn't one that + // repeats elsewhere in the group. + while (llvm::count(group_bytes, match_b) > 1) { + match_b = absl::Uniform(gen) | MetadataGroup::PresentMask; + } + // Store this as the byte to search for in this group, but without the + // present bit to simulate where we start when using a 7-bit tag + // from a hash. + bytes_storage[bm_index][g_index] = + match_b & ~MetadataGroup::PresentMask; + break; + } + case BenchKind::Empty: { + match_b = MetadataGroup::Empty; + break; + } + case BenchKind::Deleted: { + match_b = MetadataGroup::Deleted; + break; + } + } + + // Replicate the match byte in each of the other matching indices. + for (ssize_t m_index : llvm::ArrayRef(group_indices) + .drop_front() + .take_front(match_count - 1)) { + group_bytes[m_index] = match_b; + } + } + + // Now that the storage is set up, record these in our struct. + bm.metadata = metadata_storage[bm_index]; + if constexpr (Kind == BenchKind::Random) { + bm.bytes = bytes_storage[bm_index]; + } + } + return bm_storage; +} + +template +// NOLINTNEXTLINE(google-readability-casting): False positive clang-tidy bug. 
+const auto bench_metadata = BuildBenchMetadata(); + +// Benchmark that simulates the dynamic execution pattern when we match exactly +// one entry in the group, typically then using the index of the matching byte +// to index into an element of a group of entries. But notably, the *first* +// match is sufficient, and we never have to find the *next* match within the +// group. +template +static void BM_LoadMatch(benchmark::State& s) { + BenchMetadata bm = bench_metadata[0]; + + // We want to make the index used by the next iteration of the benchmark have + // a data dependency on the result of matching. A match produces an index into + // the group of metadata. To consume this match in a way that is + // representative of how it will be used in a hashtable (indexing into an + // array of entries), while establishing that dependence, we keep a + // group-sized array of the value `1` in memory that we can index into to + // increment to the next step of the loop. We do have to hide the contents of + // the loop from the optimizer by clobbering the memory. + ssize_t all_ones[GroupSize]; + for (ssize_t& n : all_ones) { + n = 1; + } + benchmark::ClobberMemory(); + + // We don't want the optimizer to peel iterations off of this loop, so hide + // the starting index. + ssize_t i = 0; + benchmark::DoNotOptimize(i); + + // This loop looks *really* attractive to unroll to the compiler. However, + // that can easily overlap some of the memory operations and generally makes + // it harder to analyze the exact operation sequence we care about. +#pragma clang loop unroll(disable) + for (auto _ : s) { + auto g = GroupT::Load(bm.metadata.data(), i * GroupSize); + typename GroupT::MatchIndex matches; + if constexpr (Kind == BenchKind::Empty) { + matches = g.MatchEmpty(); + } else if constexpr (Kind == BenchKind::Deleted) { + matches = g.MatchDeleted(); + } else { + static_assert(Kind == BenchKind::Random); + matches = static_cast(g.Match(bm.bytes[i])); + } + // Despite not being a DCHECK, this is fine for benchmarking. In an actual + // hashtable, we expect to have a test for empty of the match prior to using + // it to index an array, and that test is expected to be strongly predicted. + // That exactly matches how the `CARBON_CHECK` macro works, and so this + // serves as both a good correctness test and replication of hashtable usage + // of a match. + CARBON_CHECK(matches); + + // Now do the data-dependent increment by indexing our "all ones" array. The + // index into `all_ones` is analogous to the index into a group of hashtable + // entries. + i = (i + all_ones[matches.index()]) & (BenchSize - 1); + } +} +BENCHMARK(BM_LoadMatch); +BENCHMARK(BM_LoadMatch); +BENCHMARK(BM_LoadMatch); +#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT +BENCHMARK(BM_LoadMatch); +BENCHMARK(BM_LoadMatch); +BENCHMARK(BM_LoadMatch); +BENCHMARK(BM_LoadMatch); +BENCHMARK(BM_LoadMatch); +BENCHMARK(BM_LoadMatch); +#endif + +// Benchmark that measures the speed of a match that is only found after at +// least one miss. Because the first match doesn't work, this covers +// incrementing to the next match, with a number of increments taken from the +// `Step` template parameter. +template +static void BM_LoadMatchMissSteps(benchmark::State& s) { + static_assert(Steps > 0); + static_assert(Steps <= GroupSize); + + // We pick the benchmark metadata at index `Steps - 1`, which will have + // `Steps` matches within each group. 
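+  // For example, with `Steps == 4` every group loaded below contains exactly
+  // four bytes that match the probed tag, so the inner loop over
+  // `matched_range` runs four times.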
+ BenchMetadata bm = bench_metadata[Steps - 1]; + + // We want to make the index used by the next iteration of the benchmark have + // a data dependency on the result of matching. A match produces an index into + // the group of metadata. To consume this match in a way that is + // representative of how it will be used in a hashtable (indexing into an + // array of entries), while establishing that dependence, we keep a + // group-sized array of the value `1` in memory that we can index into to + // increment to the next step of the loop. We do have to hide the contents of + // the loop from the optimizer by clobbering the memory. + ssize_t all_ones[GroupSize]; + for (ssize_t& n : all_ones) { + n = 1; + } + benchmark::ClobberMemory(); + + // We don't want the optimizer to peel iterations off of this loop, so hide + // the starting index. + ssize_t i = 0; + benchmark::DoNotOptimize(i); + + // This loop looks *really* attractive to unroll to the compiler. However, + // that can easily overlap some of the memory operations and generally makes + // it harder to analyze the exact operation sequence we care about. +#pragma clang loop unroll(disable) + for (auto _ : s) { + auto g = MetadataGroup::Load(bm.metadata.data(), i * GroupSize); + auto matched_range = g.Match(bm.bytes[i]); + + // We don't use a `CARBON_CHECK` here as the loop below will test the range + // to see if the loop should be skipped, replicating the test that we also + // expect in hashtable usage. + + // We want to simulate the code sequence a hashtable would produce when + // matching indices are "misses" in the hashtable, but only the aspects of + // those that reflect on the specific *match* implementation's generated + // code and performance. For each index in the match, we locate it in the + // `matched_range`, extract it as an index, and use that to index a + // group-sized array. We read memory from that array to increment `indices`, + // establishing data dependencies on each match index. This loop will run + // exactly `Steps` times. + ssize_t indices = 0; + for (ssize_t index : matched_range) { + indices += all_ones[index]; + } + + // We want to propagate the data dependencies accumulated into `indices` + // into the next value of `i`, and we know exactly how many increments were + // done in the loop, so subtract that constant and add one to arrive back at + // an increment of 1. + i = (i + (indices - Steps + 1)) & (BenchSize - 1); + } +} +BENCHMARK(BM_LoadMatchMissSteps); +BENCHMARK(BM_LoadMatchMissSteps); +BENCHMARK(BM_LoadMatchMissSteps); +BENCHMARK(BM_LoadMatchMissSteps); +#if CARBON_USE_X86_SIMD_CONTROL_GROUP +BENCHMARK(BM_LoadMatchMissSteps); +BENCHMARK(BM_LoadMatchMissSteps); +#endif + +} // namespace +} // namespace Carbon::RawHashtable diff --git a/common/raw_hashtable_metadata_group_benchmark_test.sh b/common/raw_hashtable_metadata_group_benchmark_test.sh new file mode 100755 index 0000000000000..bff7a1ab1c7cf --- /dev/null +++ b/common/raw_hashtable_metadata_group_benchmark_test.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# +# Part of the Carbon Language project, under the Apache License v2.0 with LLVM +# Exceptions. See /LICENSE for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/raw_hashtable_metadata_group_benchmark" + +exec "$BENCHMARK" \ + --benchmark_counters_tabular=true \ + --benchmark_min_time=1x diff --git a/common/raw_hashtable_test_helpers.h b/common/raw_hashtable_test_helpers.h new file mode 100644 index 0000000000000..cf169ddf44476 --- /dev/null +++ b/common/raw_hashtable_test_helpers.h @@ -0,0 +1,115 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_ +#define CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_ + +#include + +#include "common/check.h" +#include "common/hashing.h" +#include "common/hashtable_key_context.h" +#include "common/ostream.h" + +namespace Carbon::RawHashtable { + +// Non-trivial type for testing. +struct TestData : Printable { + int value; + + // NOLINTNEXTLINE: google-explicit-constructor + TestData(int v) : value(v) { CARBON_CHECK(value >= 0); } + ~TestData() { + CARBON_CHECK(value >= 0); + value = -1; + } + TestData(const TestData& other) : TestData(other.value) {} + TestData(TestData&& other) noexcept : TestData(other.value) { + other.value = 0; + } + auto Print(llvm::raw_ostream& out) const -> void { out << value; } + + friend auto operator==(TestData lhs, TestData rhs) -> bool { + return lhs.value == rhs.value; + } + + friend auto operator<=>(TestData lhs, TestData rhs) -> std::strong_ordering { + return lhs.value <=> rhs.value; + } + + friend auto CarbonHashValue(TestData data, uint64_t seed) -> HashCode { + return Carbon::HashValue(data.value, seed); + } +}; + +// Test stateless key context that produces different hashes from normal. +// Changing the hash values should result in test failures if the context ever +// fails to be used. +struct TestKeyContext : DefaultKeyContext { + template + auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode { + Hasher hash(seed); + // Inject some other data to the hash. + hash.Hash(42); + hash.Hash(HashValue(key)); + return static_cast(hash); + } +}; + +// Hostile fixed hashing key context used for stress testing. Allows control +// over which parts of the hash will be forced to collide, and the values they +// are coerced to. Note that this relies on implementation details and internals +// of `HashCode`. 
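+//
+// For example, forcing the index bits to a single fixed value sends every key
+// down the same probe sequence (stressing probing and growth), while forcing
+// the tag bits means the per-group tag comparison can no longer rule out
+// non-equal keys, exercising the full key-equality check.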
+template +struct FixedHashKeyContext : DefaultKeyContext { + template + auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode { + HashCode original_hash = HashValue(key, seed); + auto raw_hash = static_cast(original_hash); + + constexpr uint64_t TagMask = (1U << TagBits) - 1; + if (FixIndexBits) { + raw_hash &= TagMask; + raw_hash |= FixedVal << TagBits; + CARBON_DCHECK(HashCode(raw_hash).ExtractIndexAndTag().first == + (FixedVal & (~static_cast(0) >> TagBits))); + } + if (FixTagBits) { + raw_hash &= ~TagMask; + raw_hash |= FixedVal & TagMask; + CARBON_DCHECK(HashCode(raw_hash).ExtractIndexAndTag().second == + (FixedVal & TagMask)); + } + return HashCode(raw_hash); + } +}; + +template +class IndexKeyContext { + public: + explicit IndexKeyContext(llvm::ArrayRef array) : array_(array) {} + + auto HashKey(const T& value, uint64_t seed) const -> HashCode { + return HashValue(value, seed); + } + auto HashKey(ssize_t index, uint64_t seed) const -> HashCode { + return HashKey(array_[index], seed); + } + + auto KeyEq(const T& lhs, ssize_t rhs_index) const -> bool { + return lhs == array_[rhs_index]; + } + auto KeyEq(ssize_t lhs_index, ssize_t rhs_index) const -> bool { + // No need to compare the elements, if the indices are equal, the values + // must be. + return lhs_index == rhs_index; + } + + private: + llvm::ArrayRef array_; +}; + +} // namespace Carbon::RawHashtable + +#endif // CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_ diff --git a/common/set.h b/common/set.h new file mode 100644 index 0000000000000..7a34e1919a95d --- /dev/null +++ b/common/set.h @@ -0,0 +1,358 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef CARBON_COMMON_SET_H_ +#define CARBON_COMMON_SET_H_ + +#include + +#include "common/check.h" +#include "common/hashtable_key_context.h" +#include "common/raw_hashtable.h" +#include "llvm/Support/Compiler.h" + +namespace Carbon { + +// Forward declarations to resolve cyclic references. +template +class SetView; +template +class SetBase; +template +class Set; + +// A read-only view type for a set of keys. +// +// This view is a cheap-to-copy type that should be passed by value, but +// provides view or read-only reference semantics to the underlying set data +// structure. +// +// This should always be preferred to a `const`-ref parameter for the `SetBase` +// or `Set` type as it provides more flexibility and a cleaner API. +// +// Note that while this type is a read-only view, that applies to the underlying +// *set* data structure, not the individual entries stored within it. Those can +// be mutated freely as long as both the hashes and equality of the keys are +// preserved. If we applied a deep-`const` design here, it would prevent using +// this type in situations where the keys carry state (unhashed and not part of +// equality) that is mutated while the associative container is not. A view of +// immutable data can always be obtained by using `SetView`, and we +// enable conversions to more-const views. This mirrors the semantics of views +// like `std::span`. +// +// A specific `KeyContextT` type can optionally be provided to configure how +// keys will be hashed and compared. The default is `DefaultKeyContext` which is +// stateless and will hash using `Carbon::HashValue` and compare using +// `operator==`. 
Every method accepting a lookup key or operating on the keys in +// the table will also accept an instance of this type. For stateless context +// types, including the default, an instance will be default constructed if not +// provided to these methods. However, stateful contexts should be constructed +// and passed in explicitly. The context type should be small and reasonable to +// pass by value, often a wrapper or pointer to the relevant context needed for +// hashing and comparing keys. For more details about the key context, see +// `hashtable_key_context.h`. +template +class SetView : RawHashtable::ViewImpl { + using ImplT = RawHashtable::ViewImpl; + + public: + using KeyT = typename ImplT::KeyT; + using KeyContextT = typename ImplT::KeyContextT; + + // This type represents the result of lookup operations. It encodes whether + // the lookup was a success as well as accessors for the key. + class LookupResult { + public: + LookupResult() = default; + explicit LookupResult(KeyT& key) : key_(&key) {} + + explicit operator bool() const { return key_ != nullptr; } + + auto key() const -> KeyT& { return *key_; } + + private: + KeyT* key_ = nullptr; + }; + + // Enable implicit conversions that add `const`-ness to the key type. + // NOLINTNEXTLINE(google-explicit-constructor) + SetView(SetView, KeyContextT> other_view) + requires(!std::same_as>) + : ImplT(other_view) {} + + // Tests whether a key is present in the set. + template + auto Contains(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> bool; + + // Lookup a key in the set. + template + auto Lookup(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> LookupResult; + + // Run the provided callback for every key in the set. + template + void ForEach(CallbackT callback) + requires(std::invocable); + + // This routine is relatively inefficient and only intended for use in + // benchmarking or logging of performance anomalies. The specific count + // returned has no specific guarantees beyond being informative in benchmarks. + // It counts how many of the keys in the hashtable have required probing + // beyond their initial group of slots. + // + // TODO: Replace with a more general metrics routine that covers other + // important aspects such as load factor, and average probe *distance*. + auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t { + return ImplT::CountProbedKeys(key_context); + } + + private: + template + friend class Set; + friend class SetBase; + friend class SetView; + + using EntryT = typename ImplT::EntryT; + + SetView() = default; + // NOLINTNEXTLINE(google-explicit-constructor): Implicit by design. + SetView(ImplT base) : ImplT(base) {} + SetView(ssize_t size, RawHashtable::Storage* storage) + : ImplT(size, storage) {} +}; + +// A base class for a `Set` type that remains mutable while type-erasing the +// `SmallSize` (SSO) template parameter. +// +// A pointer or reference to this type is the preferred way to pass a mutable +// handle to a `Set` type across API boundaries as it avoids encoding specific +// SSO sizing information while providing a near-complete mutable API. 
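+//
+// Example (a minimal sketch; `InsertDefaults` is a hypothetical caller and the
+// default key context is assumed):
+// ```cpp
+// // Accepts any `Set<int, SmallSize>` without depending on the SSO size.
+// auto InsertDefaults(SetBase<int>& set) -> void {
+//   set.Insert(0);
+//   set.Insert(1);
+// }
+//
+// Set<int, 16> s;
+// InsertDefaults(s);
+// ```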
+template +class SetBase + : protected RawHashtable::BaseImpl { + protected: + using ImplT = RawHashtable::BaseImpl; + + public: + using KeyT = typename ImplT::KeyT; + using KeyContextT = typename ImplT::KeyContextT; + using ViewT = SetView; + using LookupResult = typename ViewT::LookupResult; + + // The result type for insertion operations both indicates whether an insert + // was needed (as opposed to the key already being in the set), and provides + // access to the key. + class InsertResult { + public: + InsertResult() = default; + explicit InsertResult(bool inserted, KeyT& key) + : key_(&key), inserted_(inserted) {} + + auto is_inserted() const -> bool { return inserted_; } + + auto key() const -> KeyT& { return *key_; } + + private: + KeyT* key_; + bool inserted_; + }; + + // Implicitly convertible to the relevant view type. + // + // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay. + operator ViewT() const { return this->view_impl(); } + + // We can't chain the above conversion with the conversions on `ViewT` to add + // const, so explicitly support adding const to produce a view here. + // + // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay. + operator SetView() const { return ViewT(*this); } + + // Convenience forwarder to the view type. + template + auto Contains(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> bool { + return ViewT(*this).Contains(lookup_key, key_context); + } + + // Convenience forwarder to the view type. + template + auto Lookup(LookupKeyT lookup_key, + KeyContextT key_context = KeyContextT()) const -> LookupResult { + return ViewT(*this).Lookup(lookup_key, key_context); + } + + // Convenience forwarder to the view type. + template + void ForEach(CallbackT callback) + requires(std::invocable) + { + return ViewT(*this).ForEach(callback); + } + + // Convenience forwarder to the view type. + auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const + -> ssize_t { + return ViewT(*this).CountProbedKeys(key_context); + } + + // Insert a key into the set. If the key is already present, no insertion is + // performed and that present key is available in the result. Otherwise a new + // key is inserted and constructed from the argument and available in the + // result. + template + auto Insert(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT()) + -> InsertResult; + + // Insert a key into the set and call the provided callback to allow in-place + // construction of the key if not already present. The lookup key is passed + // through to the callback so it needn't be captured and can be kept in a + // register argument throughout. + // + // Example: + // ```cpp + // m.Insert("widget", [](MyStringViewType lookup_key, void* key_storage) { + // new (key_storage) MyStringType(lookup_key); + // }); + // ``` + template + auto Insert(LookupKeyT lookup_key, InsertCallbackT insert_cb, + KeyContextT key_context = KeyContextT()) -> InsertResult + requires std::invocable; + + // Erase a key from the set. + template + auto Erase(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT()) + -> bool; + + // Clear all key/value pairs from the set but leave the underlying hashtable + // allocated and in place. + void Clear(); + + protected: + using ImplT::ImplT; +}; + +// A data structure for a set of keys. +// +// This set supports small size optimization (or "SSO"). 
The provided +// `SmallSize` type parameter indicates the size of an embedded buffer for +// storing sets small enough to fit. The default is zero, which always allocates +// a heap buffer on construction. When non-zero, must be a multiple of the +// `MaxGroupSize` which is currently 16. The library will check that the size is +// valid and provide an error at compile time if not. We don't automatically +// select the next multiple or otherwise fit the size to the constraints to make +// it clear in the code how much memory is used by the SSO buffer. +// +// This data structure optimizes heavily for small key types that are cheap to +// move and even copy. Using types with large keys or expensive to copy keys may +// create surprising performance bottlenecks. A `std::string` key should be fine +// with generally small strings, but if some or many strings are large heap +// allocations the performance of hashtable routines may be unacceptably bad and +// another data structure or key design is likely preferable. +// +// Note that this type should typically not appear on API boundaries; either +// `SetBase` or `SetView` should be used instead. +template +class Set : public RawHashtable::TableImpl, + SmallSize> { + using BaseT = SetBase; + using ImplT = RawHashtable::TableImpl; + + public: + using KeyT = typename BaseT::KeyT; + + Set() = default; + Set(const Set& arg) = default; + Set(Set&& arg) noexcept = default; + + // Reset the entire state of the hashtable to as it was when constructed, + // throwing away any intervening allocations. + void Reset(); +}; + +template +template +auto SetView::Contains( + LookupKeyT lookup_key, KeyContextT key_context) const -> bool { + return this->LookupEntry(lookup_key, key_context) != nullptr; +} + +template +template +auto SetView::Lookup(LookupKeyT lookup_key, + KeyContextT key_context) const + -> LookupResult { + EntryT* entry = this->LookupEntry(lookup_key, key_context); + if (!entry) { + return LookupResult(); + } + + return LookupResult(entry->key()); +} + +template +template +void SetView::ForEach(CallbackT callback) + requires(std::invocable) +{ + this->ForEachEntry([callback](EntryT& entry) { callback(entry.key()); }, + [](auto...) 
{}); +} + +template +template +auto SetBase::Insert(LookupKeyT lookup_key, + KeyContextT key_context) + -> InsertResult { + return Insert( + lookup_key, + [](LookupKeyT lookup_key, void* key_storage) { + new (key_storage) KeyT(std::move(lookup_key)); + }, + key_context); +} + +template +template +auto SetBase::Insert(LookupKeyT lookup_key, + InsertCallbackT insert_cb, + KeyContextT key_context) + -> InsertResult + requires std::invocable +{ + auto [entry, inserted] = this->InsertImpl(lookup_key, key_context); + CARBON_DCHECK(entry) << "Should always result in a valid index."; + + if (LLVM_LIKELY(!inserted)) { + return InsertResult(false, entry->key()); + } + + insert_cb(lookup_key, static_cast(&entry->key_storage)); + return InsertResult(true, entry->key()); +} + +template +template +auto SetBase::Erase(LookupKeyT lookup_key, + KeyContextT key_context) + -> bool { + return this->EraseImpl(lookup_key, key_context); +} + +template +void SetBase::Clear() { + this->ClearImpl(); +} + +template +void Set::Reset() { + this->ResetImpl(); +} + +} // namespace Carbon + +#endif // CARBON_COMMON_SET_H_ diff --git a/common/set_benchmark.cpp b/common/set_benchmark.cpp new file mode 100644 index 0000000000000..048603ca53781 --- /dev/null +++ b/common/set_benchmark.cpp @@ -0,0 +1,382 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include "absl/container/flat_hash_set.h" +#include "common/raw_hashtable_benchmark_helpers.h" +#include "common/set.h" +#include "llvm/ADT/DenseSet.h" + +namespace Carbon { +namespace { + +using RawHashtable::CarbonHashDI; +using RawHashtable::GetKeysAndHitKeys; +using RawHashtable::GetKeysAndMissKeys; +using RawHashtable::HitArgs; +using RawHashtable::SizeArgs; +using RawHashtable::ValueToBool; + +template +struct IsCarbonSetImpl : std::false_type {}; +template +struct IsCarbonSetImpl> : std::true_type {}; + +template +static constexpr bool IsCarbonSet = IsCarbonSetImpl::value; + +// A wrapper around various set types that we specialize to implement a common +// API used in the benchmarks for various different map data structures that +// support different APIs. The primary template assumes a roughly +// `std::unordered_set` API design, and types with a different API design are +// supported through specializations. +template +struct SetWrapperImpl { + using KeyT = typename SetT::key_type; + + SetT s; + + auto BenchContains(KeyT k) -> bool { return s.find(k) != s.end(); } + + auto BenchLookup(KeyT k) -> bool { + auto it = s.find(k); + if (it == s.end()) { + return false; + } + // We expect keys to always convert to `true` so directly return that here. + return ValueToBool(*it); + } + + auto BenchInsert(KeyT k) -> bool { + auto result = s.insert(k); + return result.second; + } + + auto BenchErase(KeyT k) -> bool { return s.erase(k) != 0; } +}; + +// Explicit (partial) specialization for the Carbon map type that uses its +// different API design. 
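+//
+// The same `Bench*` entry points are provided, but implemented in terms of
+// `Contains`, `Lookup`, `Insert`, and `Erase` rather than the
+// `std::unordered_set`-style `find`/`insert`/`erase`.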
+template +struct SetWrapperImpl> { + using SetT = Set; + using KeyT = KT; + + SetT s; + + auto BenchContains(KeyT k) -> bool { return s.Contains(k); } + + auto BenchLookup(KeyT k) -> bool { + auto result = s.Lookup(k); + if (!result) { + return false; + } + return ValueToBool(result.key()); + } + + auto BenchInsert(KeyT k) -> bool { + auto result = s.Insert(k); + return result.is_inserted(); + } + + auto BenchErase(KeyT k) -> bool { return s.Erase(k); } +}; + +// Provide a way to override the Carbon Set specific benchmark runs with another +// hashtable implementation. When building, you can use one of these enum names +// in a macro define such as `-DCARBON_SET_BENCH_OVERRIDE=Name` in order to +// trigger a specific override for the `Set` type benchmarks. This is used to +// get before/after runs that compare the performance of Carbon's Set versus +// other implementations. +enum class SetOverride : uint8_t { + Abseil, + LLVM, + LLVMAndCarbonHash, +}; +template +struct SetWrapperOverride : SetWrapperImpl {}; + +template +struct SetWrapperOverride, SetOverride::Abseil> + : SetWrapperImpl> {}; + +template +struct SetWrapperOverride, SetOverride::LLVM> + : SetWrapperImpl> {}; + +template +struct SetWrapperOverride, + SetOverride::LLVMAndCarbonHash> + : SetWrapperImpl>> {}; + +#ifndef CARBON_SET_BENCH_OVERRIDE +template +using SetWrapper = SetWrapperImpl; +#else +template +using SetWrapper = + SetWrapperOverride; +#endif + +// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here. +#define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT) \ + BENCHMARK(NAME>)->Apply(APPLY); \ + BENCHMARK(NAME>)->Apply(APPLY); \ + BENCHMARK(NAME>)->Apply(APPLY); \ + BENCHMARK(NAME>>)->Apply(APPLY) +// NOLINTEND(bugprone-macro-parentheses) + +#define MAP_BENCHMARK_ONE_OP(NAME, APPLY) \ + MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int); \ + MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int*); \ + MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, llvm::StringRef) + +// Benchmark the "latency" of testing for a key in a set. This always tests with +// a key that is found. +// +// However, because the key is always found and because the test ultimately +// involves conditional control flow that can be predicted, we expect modern +// CPUs to perfectly predict the control flow here and turn the measurement from +// one iteration to the next into a throughput measurement rather than a real +// latency measurement. +// +// However, this does represent a particularly common way in which a set data +// structure is accessed. The numbers should just be carefully interpreted in +// the context of being more a reflection of reciprocal throughput than actual +// latency. See the `Lookup` benchmarks for a genuine latency measure with its +// own caveats. +// +// However, this does still show some interesting caching effects when querying +// large fractions of large tables, and can give a sense of the inescapable +// magnitude of these effects even when there is a great deal of prediction and +// speculative execution to hide memory access latency. 
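+//
+// `state.range(0)` is the number of keys inserted into the set, and
+// `state.range(1)` is forwarded to `GetKeysAndHitKeys` to control the pool of
+// present keys that the queries cycle through.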
+template +static void BM_SetContainsHitPtr(benchmark::State& state) { + using SetWrapperT = SetWrapper; + using KT = typename SetWrapperT::KeyT; + SetWrapperT s; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), state.range(1)); + for (auto k : keys) { + s.BenchInsert(k); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size;) { + // We block optimizing `i` as that has proven both more effective at + // blocking the loop from being optimized away and avoiding disruption of + // the generated code that we're benchmarking. + benchmark::DoNotOptimize(i); + + bool result = s.BenchContains(lookup_keys[i]); + CARBON_DCHECK(result); + // We use the lookup success to step through keys, establishing a + // dependency between each lookup. This doesn't fully allow us to measure + // latency rather than throughput, as noted above. + i += static_cast(result); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_SetContainsHitPtr, HitArgs); + +// Benchmark the "latency" (but more likely the reciprocal throughput, see +// comment above) of testing for a key in the set that is *not* present. +template +static void BM_SetContainsMissPtr(benchmark::State& state) { + using SetWrapperT = SetWrapper; + using KT = typename SetWrapperT::KeyT; + SetWrapperT s; + auto [keys, lookup_keys] = GetKeysAndMissKeys(state.range(0)); + for (auto k : keys) { + s.BenchInsert(k); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size;) { + benchmark::DoNotOptimize(i); + + bool result = s.BenchContains(lookup_keys[i]); + CARBON_DCHECK(!result); + i += static_cast(!result); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_SetContainsMissPtr, SizeArgs); + +// A somewhat contrived latency test for the lookup code path. +// +// While lookups into a set are often (but not always) simply used to influence +// control flow, that style of access produces difficult to evaluate benchmark +// results (see the comments on the `Contains` benchmarks above). +// +// So here we actually access the key in the set and convert that key's value to +// a boolean on the critical path of each iteration. This lets us have a genuine +// latency benchmark of looking up a key in the set, at the expense of being +// somewhat contrived. That said, for usage where the key object is queried or +// operated on in some way once looked up in the set, this will be fairly +// representative of the latency cost from the data structure. +template +static void BM_SetLookupHitPtr(benchmark::State& state) { + using SetWrapperT = SetWrapper; + using KT = typename SetWrapperT::KeyT; + SetWrapperT s; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), state.range(1)); + for (auto k : keys) { + s.BenchInsert(k); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size;) { + benchmark::DoNotOptimize(i); + + bool result = s.BenchLookup(lookup_keys[i]); + CARBON_DCHECK(result); + i += static_cast(result); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_SetLookupHitPtr, HitArgs); + +// First erase and then insert the key. The code path will always be the same +// here and so we expect this to largely be a throughput benchmark because of +// branch prediction and speculative execution. 
+// +// We don't expect erase followed by insertion to be a common user code +// sequence, but we don't have a good way of benchmarking either erase or insert +// in isolation -- each would change the size of the table and thus the next +// iteration's benchmark. And if we try to correct the table size outside of the +// timed region, we end up trying to exclude too fine grained of a region from +// timers to get good measurement data. +// +// Our solution is to benchmark both erase and insertion back to back. We can +// then get a good profile of the code sequence of each, and at least measure +// the sum cost of these reliably. Careful profiling can help attribute that +// cost between erase and insert in order to understand which of the two +// operations is contributing most to any performance artifacts observed. +template +static void BM_SetEraseInsertHitPtr(benchmark::State& state) { + using SetWrapperT = SetWrapper; + using KT = typename SetWrapperT::KeyT; + SetWrapperT s; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), state.range(1)); + for (auto k : keys) { + s.BenchInsert(k); + } + ssize_t lookup_keys_size = lookup_keys.size(); + + while (state.KeepRunningBatch(lookup_keys_size)) { + for (ssize_t i = 0; i < lookup_keys_size;) { + benchmark::DoNotOptimize(i); + + s.BenchErase(lookup_keys[i]); + benchmark::ClobberMemory(); + + bool inserted = s.BenchInsert(lookup_keys[i]); + CARBON_DCHECK(inserted); + i += static_cast(inserted); + } + } +} +MAP_BENCHMARK_ONE_OP(BM_SetEraseInsertHitPtr, HitArgs); + +// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here. +#define MAP_BENCHMARK_OP_SEQ_SIZE(NAME, KT) \ + BENCHMARK(NAME>)->Apply(SizeArgs); \ + BENCHMARK(NAME>)->Apply(SizeArgs); \ + BENCHMARK(NAME>)->Apply(SizeArgs); \ + BENCHMARK(NAME>>)->Apply(SizeArgs) +// NOLINTEND(bugprone-macro-parentheses) + +#define MAP_BENCHMARK_OP_SEQ(NAME) \ + MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int); \ + MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int*); \ + MAP_BENCHMARK_OP_SEQ_SIZE(NAME, llvm::StringRef) + +// This is an interesting, somewhat specialized benchmark that measures the cost +// of inserting a sequence of keys into a set up to some size and then inserting +// a colliding key and throwing away the set. +// +// This is an especially important usage pattern for sets as a large number of +// algorithms essentially look like this, such as collision detection, cycle +// detection, de-duplication, etc. +// +// It also covers both the insert-into-an-empty-slot code path that isn't +// covered elsewhere, and the code path for growing a table to a larger size. +// +// This is the second most important aspect of expected set usage after testing +// for presence. It also nicely lends itself to a single benchmark that covers +// the total cost of this usage pattern. +// +// Because this benchmark operates on whole sets, we also compute the number of +// probed keys for Carbon's set as that is both a general reflection of the +// efficacy of the underlying hash function, and a direct factor that drives the +// cost of these operations. +template +static void BM_SetInsertSeq(benchmark::State& state) { + using SetWrapperT = SetWrapper; + using KT = typename SetWrapperT::KeyT; + constexpr ssize_t LookupKeysSize = 1 << 8; + auto [keys, lookup_keys] = + GetKeysAndHitKeys(state.range(0), LookupKeysSize); + + // Now build a large shuffled set of keys (with duplicates) we'll use at the + // end. 
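+  // `lookup_keys` above serves as that pool: each iteration below rebuilds the
+  // set from `keys` and then inserts one key drawn from the pool (which must
+  // already be present), rotating through the pool across iterations.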
+ ssize_t i = 0; + for (auto _ : state) { + benchmark::DoNotOptimize(i); + + SetWrapperT s; + for (auto k : keys) { + bool inserted = s.BenchInsert(k); + CARBON_DCHECK(inserted) << "Must be a successful insert!"; + } + + // Now insert a final random repeated key. + bool inserted = s.BenchInsert(lookup_keys[i]); + CARBON_DCHECK(!inserted) << "Must already be in the map!"; + + // Rotate through the shuffled keys. + i = (i + static_cast(!inserted)) & (LookupKeysSize - 1); + } + + // It can be easier in some cases to think of this as a key-throughput rate of + // insertion rather than the latency of inserting N keys, so construct the + // rate counter as well. + state.counters["KeyRate"] = benchmark::Counter( + keys.size(), benchmark::Counter::kIsIterationInvariantRate); + + // Report some extra statistics about the Carbon type. + if constexpr (IsCarbonSet) { + // Re-build a set outside of the timing loop to look at the statistics + // rather than the timing. + SetT s; + for (auto k : keys) { + bool inserted = s.Insert(k).is_inserted(); + CARBON_DCHECK(inserted) << "Must be a successful insert!"; + } + + // While this count is "iteration invariant" (it should be exactly the same + // for every iteration as the set of keys is the same), we don't use that + // because it will scale this by the number of iterations. We want to + // display the probe count of this benchmark *parameter*, not the probe + // count that resulted from the number of iterations. That means we use the + // normal counter API without flags. + state.counters["Probed"] = s.CountProbedKeys(); + + // Uncomment this call to print out statistics about the index-collisions + // among these keys for debugging: + // + // RawHashtable::DumpHashStatistics(raw_keys); + } +} +MAP_BENCHMARK_OP_SEQ(BM_SetInsertSeq); + +} // namespace +} // namespace Carbon diff --git a/common/set_benchmark_test.sh b/common/set_benchmark_test.sh new file mode 100755 index 0000000000000..41f41b716233f --- /dev/null +++ b/common/set_benchmark_test.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# +# Part of the Carbon Language project, under the Apache License v2.0 with LLVM +# Exceptions. See /LICENSE for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/set_benchmark" + +exec "$BENCHMARK" \ + --benchmark_counters_tabular=true \ + --benchmark_min_time=1x \ + --benchmark_filter='^[^/]*/[1-9][0-9]{0,3}(/[0-9]+)?$' diff --git a/common/set_test.cpp b/common/set_test.cpp new file mode 100644 index 0000000000000..c99056ed31ad9 --- /dev/null +++ b/common/set_test.cpp @@ -0,0 +1,218 @@ +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM +// Exceptions. See /LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "common/set.h" + +#include +#include + +#include +#include +#include + +#include "common/raw_hashtable_test_helpers.h" + +namespace Carbon { +namespace { + +using RawHashtable::IndexKeyContext; +using RawHashtable::TestData; +using ::testing::UnorderedElementsAreArray; + +template +void ExpectSetElementsAre(SetT&& s, MatcherRangeT element_matchers) { + // Collect the elements into a container. + using KeyT = typename std::remove_reference::type::KeyT; + std::vector entries; + s.ForEach([&entries](KeyT& k) { entries.push_back(k); }); + + // Use the GoogleMock unordered container matcher to validate and show errors + // on wrong elements. 
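+  // The unordered matcher is used because `ForEach` visits keys in no
+  // guaranteed order.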
+ EXPECT_THAT(entries, UnorderedElementsAreArray(element_matchers)); +} + +// Allow directly using an initializer list. +template +void ExpectSetElementsAre(SetT&& s, + std::initializer_list element_matchers) { + std::vector element_matchers_storage = element_matchers; + ExpectSetElementsAre(s, element_matchers_storage); +} + +template +auto MakeElements(RangeT&& range, RangeTs&&... ranges) { + std::vector elements; + auto add_range = [&elements](RangeT&& r) { + for (const auto&& e : r) { + elements.push_back(e); + } + }; + add_range(std::forward(range)); + (add_range(std::forward(ranges)), ...); + + return elements; +} + +template +class SetTest : public ::testing::Test {}; + +using Types = ::testing::Types, Set, Set, + Set, Set>; +TYPED_TEST_SUITE(SetTest, Types); + +TYPED_TEST(SetTest, Basic) { + using SetT = TypeParam; + SetT s; + + EXPECT_FALSE(s.Contains(42)); + EXPECT_TRUE(s.Insert(1).is_inserted()); + EXPECT_TRUE(s.Contains(1)); + auto result = s.Lookup(1); + EXPECT_TRUE(result); + EXPECT_EQ(1, result.key()); + auto i_result = s.Insert(1); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_TRUE(s.Contains(1)); + + // Verify all the elements. + ExpectSetElementsAre(s, {1}); + + // Fill up a bunch to ensure we trigger growth a few times. + for (int i : llvm::seq(2, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(s.Insert(i).is_inserted()); + } + for (int i : llvm::seq(1, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(s.Contains(i)); + EXPECT_FALSE(s.Insert(i).is_inserted()); + } + EXPECT_FALSE(s.Contains(513)); + + // Verify all the elements. + ExpectSetElementsAre(s, MakeElements(llvm::seq(1, 512))); +} + +TYPED_TEST(SetTest, FactoryAPI) { + using SetT = TypeParam; + SetT s; + EXPECT_TRUE(s.Insert(1, [](int k, void* key_storage) { + return new (key_storage) int(k); + }).is_inserted()); + ASSERT_TRUE(s.Contains(1)); + // Reinsertion doesn't invoke the callback. + EXPECT_FALSE(s.Insert(1, [](int, void*) -> int* { + llvm_unreachable("Should never be called!"); + }).is_inserted()); +} + +TYPED_TEST(SetTest, Copy) { + using SetT = TypeParam; + + SetT s; + // Make sure we exceed the small size for some of the set types, but not all + // of them, so we cover all the combinations of copying between small and + // large. + for (int i : llvm::seq(1, 24)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(s.Insert(i).is_inserted()); + } + + SetT other_s1 = s; + ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24))); + + // Add some more elements to the original. + for (int i : llvm::seq(24, 32)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(s.Insert(i).is_inserted()); + } + + // The first copy doesn't change. + ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24))); + + // A new copy does. + SetT other_s2 = s; + ExpectSetElementsAre(other_s2, MakeElements(llvm::seq(1, 32))); +} + +TYPED_TEST(SetTest, Move) { + using SetT = TypeParam; + + SetT s; + // Make sure we exceed the small size for some of the set types, but not all + // of them, so we cover all the combinations of copying between small and + // large. + for (int i : llvm::seq(1, 24)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(s.Insert(i).is_inserted()); + } + + SetT other_s1 = std::move(s); + ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24))); + + // Add some more elements. 
+ for (int i : llvm::seq(24, 32)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + ASSERT_TRUE(other_s1.Insert(i).is_inserted()); + } + ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 32))); +} + +TYPED_TEST(SetTest, Conversions) { + using SetT = TypeParam; + using KeyT = SetT::KeyT; + SetT s; + ASSERT_TRUE(s.Insert(1).is_inserted()); + ASSERT_TRUE(s.Insert(2).is_inserted()); + ASSERT_TRUE(s.Insert(3).is_inserted()); + ASSERT_TRUE(s.Insert(4).is_inserted()); + + SetView sv = s; + SetView csv = sv; + SetView csv2 = s; + EXPECT_TRUE(sv.Contains(1)); + EXPECT_TRUE(csv.Contains(2)); + EXPECT_TRUE(csv2.Contains(3)); +} + +TEST(SetContextTest, Basic) { + llvm::SmallVector keys; + for (int i : llvm::seq(0, 513)) { + keys.push_back(i * 100); + } + IndexKeyContext key_context(keys); + Set> s; + + EXPECT_FALSE(s.Contains(42, key_context)); + EXPECT_TRUE(s.Insert(1, key_context).is_inserted()); + EXPECT_TRUE(s.Contains(1, key_context)); + auto result = s.Lookup(TestData(100), key_context); + EXPECT_TRUE(result); + EXPECT_EQ(1, result.key()); + auto i_result = s.Insert(1, IndexKeyContext(keys)); + EXPECT_FALSE(i_result.is_inserted()); + EXPECT_TRUE(s.Contains(1, key_context)); + + // Verify all the elements. + ExpectSetElementsAre(s, {1}); + + // Fill up a bunch to ensure we trigger growth a few times. + for (int i : llvm::seq(2, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(s.Insert(i, key_context).is_inserted()); + } + for (int i : llvm::seq(1, 512)) { + SCOPED_TRACE(llvm::formatv("Key: {0}", i).str()); + EXPECT_TRUE(s.Contains(i, key_context)); + EXPECT_FALSE(s.Insert(i, key_context).is_inserted()); + } + EXPECT_FALSE(s.Contains(0, key_context)); + EXPECT_FALSE(s.Contains(512, key_context)); + + // Verify all the elements. + ExpectSetElementsAre(s, MakeElements(llvm::seq(1, 512))); +} + +} // namespace +} // namespace Carbon