Skip to content

Commit

Permalink
Introduce custom hash table data structures. (carbon-language#3940)
Browse files Browse the repository at this point in the history
The hash table design is heavily based on Abseil's ["Swiss
Tables"][swiss-tables] design. It uses an array of bytes storing
metadata about each entry and an array of entries where each is a pair
of key and value. The metadata byte consists of 7 bits of the hash of the
key (distinct from the bits used to index the table), and one bit
indicating the presence of a special entry -- either empty or deleted.

[swiss-tables]: https://abseil.io/about/design/swisstables

There is a large range of optimizations and other nuanced aspects of
this hash table design and implementation. A good starting point for
understanding that context is `raw_hashtable.h`, which has an overview of the
design and references to various other files for relevant details.

---------

Co-authored-by: josh11b <[email protected]>
  • Loading branch information
chandlerc and josh11b authored Jun 8, 2024
1 parent ffc3327 commit 21a81bc
Show file tree
Hide file tree
Showing 24 changed files with 6,426 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_ti
build:clang-tidy --output_groups=report
build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config

# This warning seems to incorrectly fire in this build configuration, despite
# not firing in our normal builds.
build:clang-tidy --copt=-Wno-unknown-pragmas

# Default to using a disk cache to minimize re-building LLVM and Clang which we
# try to avoid updating too frequently to minimize rebuild cost. The location
# here can be overridden in the user configuration where needed.
Expand Down
1 change: 1 addition & 0 deletions .codespell_ignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ createor
crossreference
falsy
forin
groupt
inout
parameteras
pullrequest
Expand Down
175 changes: 175 additions & 0 deletions common/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,14 @@ cc_binary(
],
)

cc_library(
name = "hashtable_key_context",
hdrs = ["hashtable_key_context.h"],
deps = [
":hashing",
],
)

cc_library(
name = "indirect_value",
hdrs = ["indirect_value.h"],
Expand Down Expand Up @@ -224,6 +232,53 @@ cc_library(
alwayslink = 1,
)

cc_library(
name = "map",
hdrs = ["map.h"],
deps = [
":check",
":hashtable_key_context",
":raw_hashtable",
"@llvm-project//llvm:Support",
],
)

cc_test(
name = "map_test",
srcs = ["map_test.cpp"],
deps = [
":map",
":raw_hashtable_test_helpers",
"//testing/base:gtest_main",
"//testing/base:test_raw_ostream",
"@googletest//:gtest",
],
)

cc_binary(
name = "map_benchmark",
testonly = 1,
srcs = ["map_benchmark.cpp"],
deps = [
":map",
":raw_hashtable_benchmark_helpers",
"@abseil-cpp//absl/container:flat_hash_map",
"@abseil-cpp//absl/random",
"@google_benchmark//:benchmark_main",
"@llvm-project//llvm:Support",
],
)

sh_test(
name = "map_benchmark_test",
# The benchmark allocates a large amount of memory.
size = "enormous",
# We configure the test to run quickly.
timeout = "short",
srcs = ["map_benchmark_test.sh"],
data = [":map_benchmark"],
)

cc_library(
name = "ostream",
hdrs = ["ostream.h"],
Expand All @@ -232,6 +287,126 @@ cc_library(
],
)

cc_library(
name = "raw_hashtable",
srcs = ["raw_hashtable.cpp"],
hdrs = ["raw_hashtable.h"],
deps = [
":check",
":hashing",
":hashtable_key_context",
":raw_hashtable_metadata_group",
"@llvm-project//llvm:Support",
],
)

cc_library(
name = "raw_hashtable_metadata_group",
srcs = ["raw_hashtable_metadata_group.cpp"],
hdrs = ["raw_hashtable_metadata_group.h"],
deps = [
":check",
"@llvm-project//llvm:Support",
],
)

cc_binary(
name = "raw_hashtable_metadata_group_benchmark",
testonly = 1,
srcs = ["raw_hashtable_metadata_group_benchmark.cpp"],
deps = [
":raw_hashtable_metadata_group",
"@abseil-cpp//absl/random",
"@google_benchmark//:benchmark_main",
"@llvm-project//llvm:Support",
],
)

sh_test(
name = "raw_hashtable_metadata_group_benchmark_test",
srcs = ["raw_hashtable_metadata_group_benchmark_test.sh"],
data = [":raw_hashtable_metadata_group_benchmark"],
)

cc_library(
name = "raw_hashtable_benchmark_helpers",
testonly = 1,
srcs = ["raw_hashtable_benchmark_helpers.cpp"],
hdrs = ["raw_hashtable_benchmark_helpers.h"],
copts = [
"-O2", # Always optimize to make testing benchmarks faster.
],
deps = [
":check",
":hashing",
":raw_hashtable",
":set",
"@abseil-cpp//absl/base:no_destructor",
"@abseil-cpp//absl/hash",
"@abseil-cpp//absl/random",
"@google_benchmark//:benchmark",
"@llvm-project//llvm:Support",
],
)

cc_library(
name = "raw_hashtable_test_helpers",
testonly = 1,
hdrs = ["raw_hashtable_test_helpers.h"],
deps = [
":check",
":hashing",
":hashtable_key_context",
":ostream",
],
)

cc_library(
name = "set",
hdrs = ["set.h"],
deps = [
":check",
":hashtable_key_context",
":raw_hashtable",
"@llvm-project//llvm:Support",
],
)

cc_test(
name = "set_test",
srcs = ["set_test.cpp"],
deps = [
":raw_hashtable_test_helpers",
":set",
"//testing/base:gtest_main",
"//testing/base:test_raw_ostream",
"@googletest//:gtest",
],
)

cc_binary(
name = "set_benchmark",
testonly = 1,
srcs = ["set_benchmark.cpp"],
deps = [
":raw_hashtable_benchmark_helpers",
":set",
"@abseil-cpp//absl/container:flat_hash_set",
"@google_benchmark//:benchmark_main",
"@llvm-project//llvm:Support",
],
)

sh_test(
name = "set_benchmark_test",
# The benchmark allocates a large amount of memory.
size = "enormous",
# We configure the test to run quickly.
timeout = "short",
srcs = ["set_benchmark_test.sh"],
data = [":set_benchmark"],
)

cc_library(
name = "string_helpers",
srcs = ["string_helpers.cpp"],
Expand Down
4 changes: 2 additions & 2 deletions common/hashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,9 +573,9 @@ constexpr auto HashCode::ExtractIndex() -> ssize_t { return value_; }
template <int N>
constexpr auto HashCode::ExtractIndexAndTag() -> std::pair<ssize_t, uint32_t> {
static_assert(N >= 1);
static_assert(N <= 32);
static_assert(N < 32);
return {static_cast<ssize_t>(value_ >> N),
static_cast<uint32_t>(value_ & ((1U << (N + 1)) - 1))};
static_cast<uint32_t>(value_ & ((1U << N) - 1))};
}

// Building with `-DCARBON_MCA_MARKERS` will enable `llvm-mca` annotations in
Expand Down
6 changes: 6 additions & 0 deletions common/hashing_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ TEST(HashingTest, HashCodeAPI) {
EXPECT_THAT(a.ExtractIndex(), Ne(b.ExtractIndex()));
EXPECT_THAT(a.ExtractIndex(), Ne(empty.ExtractIndex()));

// The tag shouldn't have bits set outside the range requested.
EXPECT_THAT(HashValue("a").ExtractIndexAndTag<1>().second & ~0b1, Eq(0));
EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>().second & ~0b11, Eq(0));
EXPECT_THAT(HashValue("a").ExtractIndexAndTag<3>().second & ~0b111, Eq(0));
EXPECT_THAT(HashValue("a").ExtractIndexAndTag<4>().second & ~0b1111, Eq(0));

// Note that the index produced with a tag may be different from the index
// alone!
EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>(),
Expand Down
85 changes: 85 additions & 0 deletions common/hashtable_key_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
#define CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_

#include "common/hashing.h"

namespace Carbon {

// Customizable context for keys in hashtables.
//
// This type or customizations matching its API are used with the data
// structures in `map.h` and `set.h`. By providing a custom version of the
// `KeyContext` type parameter to those data structures, users can provide
// either stateless or stateful customization of the two core hashtable key
// operations: hashing and comparison.
//
// The default for hashing uses Carbon's `hashing.h`. Customizations must still
// return a `HashCode` as defined there, and it needs to have the same core
// properties of hashes produced by the `hashing.h` infrastructure.
//
// The default for comparison is `operator==`. The `KeyEq` method is always
// called with a key *stored in the hashtable* as the second or "RHS" parameter.
// This is to allow simplifying the set of overloads needed for heterogeneous
// contexts: only the first, LHS, parameter needs to support different lookup
// key types.
//
// Custom KeyContext types should have the same API as the default type.
// They can choose to use templates to support heterogeneous key types or not as
// appropriate. The default context can also be used as a base class with only
// one or the other of the APIs customized.
//
// An important consideration is how the key context is constructed. When the
// key context can be default constructed, hashtable APIs trafficking in keys
// will have overloads that provide a default constructed key context. When the
// context is *not* default constructible, every API that accepts a key will
// also require a context argument to be called, and that argument will be used
// throughout that operation. The intent is to allow callers to provide stateful
// contexts to each API where it would be needed, while managing that state
// outside the hashtable. Often the needed state is trivially part of the
// caller's existing state and needn't be stored separately.
//
// Example for a stateful, customized key context for interned strings:
// ```cpp
// class InternedStringIndexKeyContext {
// public:
// InternedStringIndexKeyContext(
// llvm::ArrayRef<llvm::StringRef> interned_strings)
// : interned_strings_(interned_strings) {}
//
// auto HashKey(llvm::StringRef s, uint64_t seed) const -> HashCode {
// return HashValue(s, seed);
// }
// auto HashKey(int index_key, uint64_t seed) const -> HashCode {
// return HashKey(interned_strings_[index_key], seed);
// }
//
// auto KeyEq(llvm::StringRef lhs, int rhs_index) const -> bool {
// return lhs == interned_strings_[rhs_index];
// }
// auto KeyEq(int lhs_index, int rhs_index) const -> bool {
// return KeyEq(interned_strings_[lhs_index], rhs_index);
// }
//
// private:
// llvm::ArrayRef<llvm::StringRef> interned_strings_;
// };
// ```
struct DefaultKeyContext {
  // Computes the hash of `key` by forwarding both the key and the `seed` to
  // `HashValue`. Templated so that any key type accepted by `HashValue` can be
  // hashed, including lookup key types that differ from the stored key type.
  template <typename T>
  auto HashKey(const T& key, uint64_t seed) const -> HashCode {
    return HashValue(key, seed);
  }

  // Reports whether `lhs_key` and `rhs_key` compare equal using `operator==`.
  // The two key types are independent template parameters so the lookup key
  // (LHS) may have a different type than the key it is compared against (RHS).
  template <typename LookupT, typename StoredT>
  auto KeyEq(const LookupT& lhs_key, const StoredT& rhs_key) const -> bool {
    const bool keys_match = lhs_key == rhs_key;
    return keys_match;
  }
};

} // namespace Carbon

#endif // CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
Loading

0 comments on commit 21a81bc

Please sign in to comment.