Skip to content

Commit

Permalink
Improve LocalVocab comments and code
Browse files Browse the repository at this point in the history
Major revision of comments, many of which were completely outdated. Also
improve variable names and make some minor changes to the code.

Make the methods `size()` and `empty()` run in constant time.
  • Loading branch information
Hannah Bast committed Nov 18, 2024
1 parent 55e0617 commit 7804d56
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 71 deletions.
43 changes: 22 additions & 21 deletions src/engine/LocalVocab.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2022, University of Freiburg
// Copyright 2022 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Hannah Bast <[email protected]>
// Authors: Hannah Bast <[email protected]>
// Johannes Kalmbach <[email protected]>

#include "engine/LocalVocab.h"

Expand All @@ -10,47 +11,48 @@

// _____________________________________________________________________________
LocalVocab LocalVocab::clone() const {
LocalVocab localVocabClone;
localVocabClone.otherWordSets_ = otherWordSets_;
localVocabClone.otherWordSets_.push_back(primaryWordSet_);
// Return the clone.
return localVocabClone;
LocalVocab result;
result.mergeWith(std::span{this, 1});
AD_CORRECTNESS_CHECK(result.size_ == size_);
return result;
}

// _____________________________________________________________________________
LocalVocab LocalVocab::merge(std::span<const LocalVocab*> vocabs) {
LocalVocab res;
res.mergeWith(vocabs |
std::views::transform(
[](const LocalVocab* localVocab) -> const LocalVocab& {
return *localVocab;
}));
return res;
LocalVocab result;
result.mergeWith(vocabs |
std::views::transform(
[](const LocalVocab* localVocab) -> const LocalVocab& {
return *localVocab;
}));
return result;
}

// _____________________________________________________________________________
template <typename WordT>
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
  // Try to insert `word` into the primary set. The result is a pair of an
  // iterator to the entry in the set and a flag that tells whether the
  // insertion actually took place.
  const auto insertion = primaryWordSet().insert(AD_FWD(word));
  // Keep the cached word count in sync with the primary set: it only grows
  // when the insertion took place.
  if (insertion.second) {
    ++size_;
  }
  // The returned index is the address of the entry inside the primary set.
  // TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
  // the MacOS build).
  return &(*insertion.first);
}

// _____________________________________________________________________________
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(
const LiteralOrIri& word) {
const LocalVocabEntry& word) {
return getIndexAndAddIfNotContainedImpl(word);
}

// _____________________________________________________________________________
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(LiteralOrIri&& word) {
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(
LocalVocabEntry&& word) {
return getIndexAndAddIfNotContainedImpl(std::move(word));
}

// _____________________________________________________________________________
std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
const LiteralOrIri& word) const {
const LocalVocabEntry& word) const {
auto localVocabIndex = primaryWordSet().find(word);
if (localVocabIndex != primaryWordSet().end()) {
// TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
Expand All @@ -62,15 +64,14 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
}

// _____________________________________________________________________________
const LocalVocab::LiteralOrIri& LocalVocab::getWord(
const LocalVocabEntry& LocalVocab::getWord(
LocalVocabIndex localVocabIndex) const {
return *localVocabIndex;
}

// _____________________________________________________________________________
std::vector<LocalVocab::LiteralOrIri> LocalVocab::getAllWordsForTesting()
const {
std::vector<LiteralOrIri> result;
std::vector<LocalVocabEntry> LocalVocab::getAllWordsForTesting() const {
std::vector<LocalVocabEntry> result;
std::ranges::copy(primaryWordSet(), std::back_inserter(result));
for (const auto& previous : otherWordSets_) {
std::ranges::copy(*previous, std::back_inserter(result));
Expand Down
114 changes: 65 additions & 49 deletions src/engine/LocalVocab.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2022, University of Freiburg
// Copyright 2022 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Hannah Bast <[email protected]>
// Authors: Hannah Bast <[email protected]>
// Johannes Kalmbach <[email protected]>

#pragma once

Expand All @@ -14,31 +15,34 @@
#include "global/Id.h"
#include "parser/LiteralOrIri.h"
#include "util/BlankNodeManager.h"
#include "util/Exception.h"

// A class for maintaining a local vocabulary with contiguous (local) IDs. This
// is meant for words that are not part of the normal vocabulary (constructed
// from the input data at indexing time).
// A class for maintaining a local vocabulary, which conceptually is a set of
// `LiteralOrIri`s that are not part of the original vocabulary (which stems
// from the input data). The implementation is subtle and quite clever:
//

// The entries of the local vocabulary are `LocalVocabEntry`s, each of which

Check failure on line 24 in src/engine/LocalVocab.h

View workflow job for this annotation

GitHub Actions / Check for spelling errors

entrys ==> entries, entry
// holds a `LiteralOrIri` and remembers its position in the original vocabulary
// after it has been computed once.
//
// A `LocalVocab` has a primary set of `LocalVocabEntry`s, which can grow
// dynamically, and a collection of other sets of `LocalVocabEntry`s, which are
// static. A `LocalVocabEntry` lives exactly as long as it is contained in at
// least one of the (primary or other) sets of a `LocalVocab`.
class LocalVocab {
private:
using Entry = LocalVocabEntry;
using LiteralOrIri = LocalVocabEntry;
// A map of the words in the local vocabulary to their local IDs. This is a
// node hash map because we need the addresses of the words (which are of type
// `LiteralOrIri`) to remain stable over their lifetime in the hash map
// because we hand out pointers to them.
using Set = absl::node_hash_set<LiteralOrIri>;
// The primary set of `LocalVocabEntry`s, which can grow dynamically.
using Set = absl::node_hash_set<LocalVocabEntry>;
std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>();

// Local vocabularies from child operations that were merged into this
// vocabulary s.t. the pointers are kept alive. They have to be `const`
// because they are possibly shared concurrently (for example via the cache).
// The other sets of `LocalVocabEntry`s, which are static.
std::vector<std::shared_ptr<const Set>> otherWordSets_;

auto& primaryWordSet() { return *primaryWordSet_; }
const auto& primaryWordSet() const { return *primaryWordSet_; }
// The number of words (so that we can compute `size()` in constant time).
size_t size_ = 0;

// Each `LocalVocab` has its own `LocalBlankNodeManager` to generate blank
// nodes when needed (e.g., when parsing the result of a SERVICE query).
std::optional<ad_utility::BlankNodeManager::LocalBlankNodeManager>
localBlankNodeManager_;

Expand All @@ -50,60 +54,68 @@ class LocalVocab {
LocalVocab(const LocalVocab&) = delete;
LocalVocab& operator=(const LocalVocab&) = delete;

// Make a logical copy. The clone will have an empty primary set so it can
// safely be modified. The contents are copied as shared pointers to const, so
// the function runs in linear time in the number of word sets.
// Make a logical copy, where all sets of `LocalVocabEntry`s become static
// and the primary set becomes empty. This only copies shared pointers and
// takes time linear in the number of sets.
LocalVocab clone() const;

// Moving a local vocabulary is not problematic (though the typical use case
// in our code is to copy shared pointers to local vocabularies).
// in our code is to copy shared pointers from one `LocalVocab` to another).
LocalVocab(LocalVocab&&) = default;
LocalVocab& operator=(LocalVocab&&) = default;

// Get the index of a word in the local vocabulary. If the word was already
// contained, return the already existing index. If the word was not yet
// contained, add it, and return the new index.
LocalVocabIndex getIndexAndAddIfNotContained(const LiteralOrIri& word);
LocalVocabIndex getIndexAndAddIfNotContained(LiteralOrIri&& word);
// For a given `LocalVocabEntry`, return the corresponding `LocalVocabIndex`
// (which is just the address of the `LocalVocabEntry`). If the
// `LocalVocabEntry` is not yet contained in the primary set, add it to the
// primary set. NOTE: the insertion only consults the primary set, not the
// other sets.
LocalVocabIndex getIndexAndAddIfNotContained(const LocalVocabEntry& word);
LocalVocabIndex getIndexAndAddIfNotContained(LocalVocabEntry&& word);

// Get the index of a word in the local vocabulary, or std::nullopt if it is
// not contained. This is useful for testing.
// Like `getIndexAndAddIfNotContained`, but if the `LocalVocabEntry` is not
// contained in any of the sets, do not add it and return `std::nullopt`.
std::optional<LocalVocabIndex> getIndexOrNullopt(
const LiteralOrIri& word) const;
const LocalVocabEntry& word) const;

// The number of words in the vocabulary.
// Note: This is not constant time, but linear in the number of word sets.
// The number of words in this local vocabulary.
size_t size() const {
auto result = primaryWordSet().size();
for (const auto& previous : otherWordSets_) {
result += previous->size();
if constexpr (ad_utility::areExpensiveChecksEnabled) {
auto size = primaryWordSet().size();
for (const auto& previous : otherWordSets_) {
size += previous->size();
}
AD_CORRECTNESS_CHECK(size == size_);
}
return result;
return size_;
}

// Return true if and only if the local vocabulary is empty.
bool empty() const { return size() == 0; }

// Return a const reference to the word.
const LiteralOrIri& getWord(LocalVocabIndex localVocabIndex) const;

// Create a local vocab that contains and keeps alive all the words from each
// of the `vocabs`. The primary word set of the newly created vocab is empty.
static LocalVocab merge(std::span<const LocalVocab*> vocabs);
// Get the `LocalVocabEntry` corresponding to the given `LocalVocabIndex`.
const LocalVocabEntry& getWord(LocalVocabIndex localVocabIndex) const;

// Merge all passed local vocabs to keep alive all the words from each of the
// `vocabs`.
// Add all sets (primary and other) of the given local vocabs as other sets
// to this local vocab. The purpose is to keep all the contained
// `LocalVocabEntry`s alive as long as this `LocalVocab` is alive. The
// primary set of this `LocalVocab` remains unchanged.
template <std::ranges::range R>
void mergeWith(const R& vocabs) {
auto inserter = std::back_inserter(otherWordSets_);
for (const auto& vocab : vocabs) {
using std::views::filter;
for (const auto& vocab : vocabs | filter(std::not_fn(&LocalVocab::empty))) {
std::ranges::copy(vocab.otherWordSets_, inserter);
*inserter = vocab.primaryWordSet_;
size_ += vocab.size_;
}
}

// Return all the words from all the word sets as a vector.
std::vector<LiteralOrIri> getAllWordsForTesting() const;
// Create a new local vocab with an empty primary set and other sets that are
// the union of all sets (primary and other) of the given local vocabs.
static LocalVocab merge(std::span<const LocalVocab*> vocabs);

// Return all the words from all the word sets as a vector. This is useful
// for testing.
std::vector<LocalVocabEntry> getAllWordsForTesting() const;

// Get a new BlankNodeIndex using the LocalBlankNodeManager.
[[nodiscard]] BlankNodeIndex getBlankNodeIndex(
Expand All @@ -114,8 +126,12 @@ class LocalVocab {
bool isBlankNodeIndexContained(BlankNodeIndex blankNodeIndex) const;

private:
// Common implementation for the two variants of
// `getIndexAndAddIfNotContainedImpl` above.
// Accessors for the primary set.
Set& primaryWordSet() { return *primaryWordSet_; }
const Set& primaryWordSet() const { return *primaryWordSet_; }

// Common implementation for the two overloads of
// `getIndexAndAddIfNotContained` above.
template <typename WordT>
LocalVocabIndex getIndexAndAddIfNotContainedImpl(WordT&& word);
};
2 changes: 1 addition & 1 deletion src/index/LocalVocabEntry.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class alignas(16) LocalVocabEntry
// the first *larger* word in the vocabulary. Note: we store the cache as
// three separate atomics to avoid mutexes. The downside is, that in parallel
// code multiple threads might look up the position concurrently, which wastes
// a bit of resources. We however don't consider this case to be likely.
// a bit of resources. However, we don't consider this case to be likely.
mutable ad_utility::CopyableAtomic<VocabIndex> lowerBoundInVocab_;
mutable ad_utility::CopyableAtomic<VocabIndex> upperBoundInVocab_;
mutable ad_utility::CopyableAtomic<bool> positionInVocabKnown_ = false;
Expand Down

0 comments on commit 7804d56

Please sign in to comment.