-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve
LocalVocab
comments and code (#1626)
Major revision of comments, many of which were completely outdated. Also improve variable names and make some minor changes to the code. In particular, the methods `size()` and `empty()` now run in constant time.
- Loading branch information
1 parent
92d906f
commit 774ea83
Showing
4 changed files
with
142 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,55 @@ | ||
// Copyright 2022, University of Freiburg | ||
// Copyright 2022 - 2024, University of Freiburg | ||
// Chair of Algorithms and Data Structures | ||
// Author: Hannah Bast <[email protected]> | ||
// Authors: Hannah Bast <[email protected]> | ||
// Johannes Kalmbach <[email protected]> | ||
|
||
#include "engine/LocalVocab.h" | ||
|
||
#include "absl/strings/str_cat.h" | ||
#include "global/Id.h" | ||
#include "global/ValueId.h" | ||
#include "util/TransparentFunctors.h" | ||
|
||
// _____________________________________________________________________________ | ||
LocalVocab LocalVocab::clone() const { | ||
LocalVocab clone; | ||
clone.mergeWith(std::span{this, 1}); | ||
return clone; | ||
LocalVocab result; | ||
result.mergeWith(std::span{this, 1}); | ||
AD_CORRECTNESS_CHECK(result.size_ == size_); | ||
return result; | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
LocalVocab LocalVocab::merge(std::span<const LocalVocab*> vocabs) { | ||
LocalVocab res; | ||
res.mergeWith(vocabs | | ||
std::views::transform( | ||
[](const LocalVocab* localVocab) -> const LocalVocab& { | ||
return *localVocab; | ||
})); | ||
return res; | ||
LocalVocab result; | ||
result.mergeWith(vocabs | std::views::transform(ad_utility::dereference)); | ||
return result; | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
// Shared implementation of the const-reference and rvalue-reference overloads | ||
// of `getIndexAndAddIfNotContained`: `word` is perfectly forwarded into the | ||
// primary set, and the address of the stored entry is returned as its index. | ||
template <typename WordT> | ||
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { | ||
// `insert` yields an iterator to the stored entry plus a flag that tells | ||
// whether `word` was newly added to the primary set. | ||
auto [wordIterator, isNewWord] = primaryWordSet().insert(AD_FWD(word)); | ||
// Only count the word if it was actually inserted, so that the cached | ||
// `size_` stays in sync with the sets. | ||
size_ += static_cast<size_t>(isNewWord); | ||
// TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks | ||
// the MacOS build). | ||
return &(*wordIterator); | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained( | ||
const LiteralOrIri& word) { | ||
const LocalVocabEntry& word) { | ||
return getIndexAndAddIfNotContainedImpl(word); | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(LiteralOrIri&& word) { | ||
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained( | ||
LocalVocabEntry&& word) { | ||
return getIndexAndAddIfNotContainedImpl(std::move(word)); | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt( | ||
const LiteralOrIri& word) const { | ||
const LocalVocabEntry& word) const { | ||
auto localVocabIndex = primaryWordSet().find(word); | ||
if (localVocabIndex != primaryWordSet().end()) { | ||
// TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks | ||
|
@@ -60,15 +61,14 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt( | |
} | ||
|
||
// _____________________________________________________________________________ | ||
const LocalVocab::LiteralOrIri& LocalVocab::getWord( | ||
const LocalVocabEntry& LocalVocab::getWord( | ||
LocalVocabIndex localVocabIndex) const { | ||
return *localVocabIndex; | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
std::vector<LocalVocab::LiteralOrIri> LocalVocab::getAllWordsForTesting() | ||
const { | ||
std::vector<LiteralOrIri> result; | ||
std::vector<LocalVocabEntry> LocalVocab::getAllWordsForTesting() const { | ||
std::vector<LocalVocabEntry> result; | ||
std::ranges::copy(primaryWordSet(), std::back_inserter(result)); | ||
for (const auto& previous : otherWordSets_) { | ||
std::ranges::copy(*previous, std::back_inserter(result)); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
// Copyright 2022, University of Freiburg | ||
// Copyright 2022 - 2024, University of Freiburg | ||
// Chair of Algorithms and Data Structures | ||
// Author: Hannah Bast <[email protected]> | ||
// Authors: Hannah Bast <[email protected]> | ||
// Johannes Kalmbach <[email protected]> | ||
|
||
#pragma once | ||
|
||
|
@@ -14,31 +15,39 @@ | |
#include "global/Id.h" | ||
#include "parser/LiteralOrIri.h" | ||
#include "util/BlankNodeManager.h" | ||
#include "util/Exception.h" | ||
|
||
// A class for maintaining a local vocabulary with contiguous (local) IDs. This | ||
// is meant for words that are not part of the normal vocabulary (constructed | ||
// from the input data at indexing time). | ||
// A class for maintaining a local vocabulary, which conceptually is a set of | ||
// `LiteralOrIri`s that are not part of the original vocabulary (which stems | ||
// from the input data). The implementation is subtle and quite clever: | ||
// | ||
|
||
// The entries of the local vocabulary are `LocalVocabEntry`s, each of which | ||
// holds a `LiteralOrIri` and remembers its position in the original vocabulary | ||
// after it has been computed once. | ||
// | ||
// A `LocalVocab` has a primary set of `LocalVocabEntry`s, which can grow | ||
// dynamically, and a collection of other sets of `LocalVocabEntry`s, which | ||
// cannot be modified by this class. A `LocalVocabEntry` lives exactly as long | ||
// as it is contained in at least one of the (primary or other) sets of a | ||
// `LocalVocab`. | ||
class LocalVocab { | ||
private: | ||
using Entry = LocalVocabEntry; | ||
using LiteralOrIri = LocalVocabEntry; | ||
// A map of the words in the local vocabulary to their local IDs. This is a | ||
// node hash map because we need the addresses of the words (which are of type | ||
// `LiteralOrIri`) to remain stable over their lifetime in the hash map | ||
// because we hand out pointers to them. | ||
using Set = absl::node_hash_set<LiteralOrIri>; | ||
// The primary set of `LocalVocabEntry`s, which can grow dynamically. | ||
// | ||
// NOTE: This is an `absl::node_hash_set` because we hand out pointers to | ||
// the `LocalVocabEntry`s and it is hence essential that their addresses | ||
// remain stable over their lifetime in the hash set. | ||
using Set = absl::node_hash_set<LocalVocabEntry>; | ||
std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>(); | ||
|
||
// Local vocabularies from child operations that were merged into this | ||
// vocabulary s.t. the pointers are kept alive. They have to be `const` | ||
// because they are possibly shared concurrently (for example via the cache). | ||
// The other sets of `LocalVocabEntry`s, which are static. | ||
std::vector<std::shared_ptr<const Set>> otherWordSets_; | ||
|
||
auto& primaryWordSet() { return *primaryWordSet_; } | ||
const auto& primaryWordSet() const { return *primaryWordSet_; } | ||
// The number of words (so that we can compute `size()` in constant time). | ||
size_t size_ = 0; | ||
|
||
// Each `LocalVocab` has its own `LocalBlankNodeManager` to generate blank | ||
// nodes when needed (e.g., when parsing the result of a SERVICE query). | ||
std::optional<ad_utility::BlankNodeManager::LocalBlankNodeManager> | ||
localBlankNodeManager_; | ||
|
||
|
@@ -50,61 +59,72 @@ class LocalVocab { | |
LocalVocab(const LocalVocab&) = delete; | ||
LocalVocab& operator=(const LocalVocab&) = delete; | ||
|
||
// Make a logical copy. The clone will have an empty primary set so it can | ||
// safely be modified. The contents are copied as shared pointers to const, so | ||
// the function runs in linear time in the number of word sets. | ||
// Make a logical copy, where all sets of `LocalVocabEntry`s become "other" | ||
// sets, that is, they cannot be modified by the copy. The primary set becomes | ||
// empty. This only copies shared pointers and takes time linear in the number | ||
// of sets. | ||
LocalVocab clone() const; | ||
|
||
// Moving a local vocabulary is not problematic (though the typical use case | ||
// in our code is to copy shared pointers to local vocabularies). | ||
// in our code is to copy shared pointers from one `LocalVocab` to another). | ||
LocalVocab(LocalVocab&&) = default; | ||
LocalVocab& operator=(LocalVocab&&) = default; | ||
|
||
// Get the index of a word in the local vocabulary. If the word was already | ||
// contained, return the already existing index. If the word was not yet | ||
// contained, add it, and return the new index. | ||
LocalVocabIndex getIndexAndAddIfNotContained(const LiteralOrIri& word); | ||
LocalVocabIndex getIndexAndAddIfNotContained(LiteralOrIri&& word); | ||
// For a given `LocalVocabEntry`, return the corresponding `LocalVocabIndex` | ||
// (which is just the address of the `LocalVocabEntry`). If the | ||
// `LocalVocabEntry` is not contained in any of the sets, add it to the | ||
// primary set. | ||
LocalVocabIndex getIndexAndAddIfNotContained(const LocalVocabEntry& word); | ||
LocalVocabIndex getIndexAndAddIfNotContained(LocalVocabEntry&& word); | ||
|
||
// Get the index of a word in the local vocabulary, or std::nullopt if it is | ||
// not contained. This is useful for testing. | ||
// Like `getIndexAndAddIfNotContained`, but if the `LocalVocabEntry` is not | ||
// contained in any of the sets, do not add it and return `std::nullopt`. | ||
std::optional<LocalVocabIndex> getIndexOrNullopt( | ||
const LiteralOrIri& word) const; | ||
const LocalVocabEntry& word) const; | ||
|
||
// The number of words in the vocabulary. | ||
// Note: This is not constant time, but linear in the number of word sets. | ||
// The number of words in this local vocabulary. | ||
size_t size() const { | ||
auto result = primaryWordSet().size(); | ||
for (const auto& previous : otherWordSets_) { | ||
result += previous->size(); | ||
if constexpr (ad_utility::areExpensiveChecksEnabled) { | ||
auto size = primaryWordSet().size(); | ||
for (const auto& previous : otherWordSets_) { | ||
size += previous->size(); | ||
} | ||
AD_CORRECTNESS_CHECK(size == size_); | ||
} | ||
return result; | ||
return size_; | ||
} | ||
|
||
// Return true if and only if the local vocabulary is empty. | ||
bool empty() const { return size() == 0; } | ||
|
||
// Return a const reference to the word. | ||
const LiteralOrIri& getWord(LocalVocabIndex localVocabIndex) const; | ||
|
||
// Create a local vocab that contains and keeps alive all the words from each | ||
// of the `vocabs`. The primary word set of the newly created vocab is empty. | ||
static LocalVocab merge(std::span<const LocalVocab*> vocabs); | ||
// Get the `LocalVocabEntry` corresponding to the given `LocalVocabIndex`. | ||
// | ||
// NOTE: This used to be a more complex function but is now a simple | ||
// dereference. It could be thrown out in the future. | ||
const LocalVocabEntry& getWord(LocalVocabIndex localVocabIndex) const; | ||
|
||
// Merge all passed local vocabs to keep alive all the words from each of the | ||
// `vocabs`. | ||
// Add all sets (primary and other) of the given local vocabs as other sets | ||
// to this local vocab. The purpose is to keep all the contained | ||
// `LocalVocabEntry`s alive as long as this `LocalVocab` is alive. The | ||
// primary set of this `LocalVocab` remains unchanged. | ||
template <std::ranges::range R> | ||
void mergeWith(const R& vocabs) { | ||
// All sets taken over from `vocabs` are appended to `otherWordSets_`. | ||
auto inserter = std::back_inserter(otherWordSets_); | ||
using std::views::filter; | ||
// Skip vocabs for which `empty()` returns true; they contribute no words. | ||
for (const auto& vocab : vocabs | filter(std::not_fn(&LocalVocab::empty))) { | ||
// Take over the shared pointers to the other sets of `vocab` and then the | ||
// shared pointer to its primary set; no `LocalVocabEntry` is copied. | ||
std::ranges::copy(vocab.otherWordSets_, inserter); | ||
*inserter = vocab.primaryWordSet_; | ||
// Keep the cached word count in sync with the newly added sets. | ||
size_ += vocab.size_; | ||
} | ||
} | ||
|
||
// Return all the words from all the word sets as a vector. | ||
std::vector<LiteralOrIri> getAllWordsForTesting() const; | ||
// Create a new local vocab with an empty primary set and other sets that are | ||
// the union of all sets (primary and other) of the given local vocabs. | ||
static LocalVocab merge(std::span<const LocalVocab*> vocabs); | ||
|
||
// Return all the words from all the word sets as a vector. This is useful | ||
// for testing. | ||
std::vector<LocalVocabEntry> getAllWordsForTesting() const; | ||
|
||
// Get a new BlankNodeIndex using the LocalBlankNodeManager. | ||
[[nodiscard]] BlankNodeIndex getBlankNodeIndex( | ||
|
@@ -115,8 +135,12 @@ class LocalVocab { | |
bool isBlankNodeIndexContained(BlankNodeIndex blankNodeIndex) const; | ||
|
||
private: | ||
// Common implementation for the two variants of | ||
// `getIndexAndAddIfNotContainedImpl` above. | ||
// Accessors for the primary set. | ||
Set& primaryWordSet() { return *primaryWordSet_; } | ||
const Set& primaryWordSet() const { return *primaryWordSet_; } | ||
|
||
// Common implementation for the two overloads of | ||
// `getIndexAndAddIfNotContained` above. | ||
template <typename WordT> | ||
LocalVocabIndex getIndexAndAddIfNotContainedImpl(WordT&& word); | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.