From 7804d562954abeba472a8da0ba33ca4e53e89912 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Mon, 18 Nov 2024 23:48:18 +0100 Subject: [PATCH 1/3] Improve `LocalVocab` comments and code Major revision of comments, many of which were completely outdated. Also improve variable names and make some minor changes to the code. Make the methods `size()` and `empty()` run in constant time. --- src/engine/LocalVocab.cpp | 43 +++++++------- src/engine/LocalVocab.h | 114 ++++++++++++++++++++---------------- src/index/LocalVocabEntry.h | 2 +- 3 files changed, 88 insertions(+), 71 deletions(-) diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index 1dbc8e7c68..ea16ae3215 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -1,6 +1,7 @@ -// Copyright 2022, University of Freiburg +// Copyright 2022 - 2024, University of Freiburg // Chair of Algorithms and Data Structures -// Author: Hannah Bast +// Authors: Hannah Bast +// Johannes Kalmbach #include "engine/LocalVocab.h" @@ -10,28 +11,28 @@ // _____________________________________________________________________________ LocalVocab LocalVocab::clone() const { - LocalVocab localVocabClone; - localVocabClone.otherWordSets_ = otherWordSets_; - localVocabClone.otherWordSets_.push_back(primaryWordSet_); - // Return the clone. - return localVocabClone; + LocalVocab result; + result.mergeWith(std::span{this, 1}); + AD_CORRECTNESS_CHECK(result.size_ == size_); + return result; } // _____________________________________________________________________________ LocalVocab LocalVocab::merge(std::span vocabs) { - LocalVocab res; - res.mergeWith(vocabs | - std::views::transform( - [](const LocalVocab* localVocab) -> const LocalVocab& { - return *localVocab; - })); - return res; + LocalVocab result; + result.mergeWith(vocabs | + std::views::transform( + [](const LocalVocab* localVocab) -> const LocalVocab& { + return *localVocab; + })); + return result; } // _____________________________________________________________________________ template LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { auto [wordIterator, isNewWord] = primaryWordSet().insert(AD_FWD(word)); + size_ += isNewWord; // TODO Use std::to_address (more idiomatic, but currently breaks // the MacOS build. return &(*wordIterator); @@ -39,18 +40,19 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { // _____________________________________________________________________________ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained( - const LiteralOrIri& word) { + const LocalVocabEntry& word) { return getIndexAndAddIfNotContainedImpl(word); } // _____________________________________________________________________________ -LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(LiteralOrIri&& word) { +LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained( + LocalVocabEntry&& word) { return getIndexAndAddIfNotContainedImpl(std::move(word)); } // _____________________________________________________________________________ std::optional LocalVocab::getIndexOrNullopt( - const LiteralOrIri& word) const { + const LocalVocabEntry& word) const { auto localVocabIndex = primaryWordSet().find(word); if (localVocabIndex != primaryWordSet().end()) { // TODO Use std::to_address (more idiomatic, but currently breaks @@ -62,15 +64,14 @@ std::optional LocalVocab::getIndexOrNullopt( } // _____________________________________________________________________________ -const LocalVocab::LiteralOrIri& LocalVocab::getWord( +const LocalVocabEntry& LocalVocab::getWord( LocalVocabIndex localVocabIndex) const { return *localVocabIndex; } // _____________________________________________________________________________ -std::vector LocalVocab::getAllWordsForTesting() - const { - std::vector result; +std::vector LocalVocab::getAllWordsForTesting() const { + std::vector result; std::ranges::copy(primaryWordSet(), std::back_inserter(result)); for (const auto& previous : otherWordSets_) { std::ranges::copy(*previous, std::back_inserter(result)); diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 3055c400a6..7f9082f3be 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -1,6 +1,7 @@ -// Copyright 2022, University of Freiburg +// Copyright 2022 - 2024, University of Freiburg // Chair of Algorithms and Data Structures -// Author: Hannah Bast +// Authors: Hannah Bast +// Johannes Kalmbach #pragma once @@ -14,31 +15,34 @@ #include "global/Id.h" #include "parser/LiteralOrIri.h" #include "util/BlankNodeManager.h" +#include "util/Exception.h" -// A class for maintaining a local vocabulary with contiguous (local) IDs. This -// is meant for words that are not part of the normal vocabulary (constructed -// from the input data at indexing time). +// A class for maintaining a local vocabulary, which conceptually is a set of +// `LiteralOrIri`s that are not part of the original vocabulary (which stems +// from the input data). The implementation is subtle and quite clever: // - +// The entrys of the local vocabulary are `LocalVocabEntry`s, each of which +// holds a `LiteralOrIri` and remembers its position in the original vocabulary +// after it has been computed once. +// +// A `LocalVocab` has a primary set of `LocalVocabEntry`s, which can grow +// dynamically, and a collection of other sets of `LocalVocabEntry`s, which are +// static. A `LocalVocabEntry` lives exactly as long as it is contained in at +// least one of the (primary or other) sets of a `LocalVocab`. class LocalVocab { private: - using Entry = LocalVocabEntry; - using LiteralOrIri = LocalVocabEntry; - // A map of the words in the local vocabulary to their local IDs. This is a - // node hash map because we need the addresses of the words (which are of type - // `LiteralOrIri`) to remain stable over their lifetime in the hash map - // because we hand out pointers to them. - using Set = absl::node_hash_set; + // The primary set of `LocalVocabEntry`s, which can grow dynamically. + using Set = absl::node_hash_set; std::shared_ptr primaryWordSet_ = std::make_shared(); - // Local vocabularies from child operations that were merged into this - // vocabulary s.t. the pointers are kept alive. They have to be `const` - // because they are possibly shared concurrently (for example via the cache). + // The other sets of `LocalVocabEntry`s, which are static. std::vector> otherWordSets_; - auto& primaryWordSet() { return *primaryWordSet_; } - const auto& primaryWordSet() const { return *primaryWordSet_; } + // The number of words (so that we can compute `size()` in constant time). + size_t size_ = 0; + // Each `LocalVocab` has its own `LocalBlankNodeManager` to generate blank + // nodes when needed (e.g., when parsing the result of a SERVICE query). std::optional localBlankNodeManager_; @@ -50,60 +54,68 @@ class LocalVocab { LocalVocab(const LocalVocab&) = delete; LocalVocab& operator=(const LocalVocab&) = delete; - // Make a logical copy. The clone will have an empty primary set so it can - // safely be modified. The contents are copied as shared pointers to const, so - // the function runs in linear time in the number of word sets. + // Make a logical copy, where all sets of `LocalVocabEntry`s become static + // and the primary set becomes empty. This only copies shared pointers and + // takes time linear in the number of sets. LocalVocab clone() const; // Moving a local vocabulary is not problematic (though the typical use case - // in our code is to copy shared pointers to local vocabularies). + // in our code is to copy shared pointers from one `LocalVocab` to another). LocalVocab(LocalVocab&&) = default; LocalVocab& operator=(LocalVocab&&) = default; - // Get the index of a word in the local vocabulary. If the word was already - // contained, return the already existing index. If the word was not yet - // contained, add it, and return the new index. - LocalVocabIndex getIndexAndAddIfNotContained(const LiteralOrIri& word); - LocalVocabIndex getIndexAndAddIfNotContained(LiteralOrIri&& word); + // For a given `LocalVocabEntry`, return the corresponding `LocalVocabIndex` + // (which is just the address of the `LocalVocabEntry`). If the + // `LocalVocabEntry` is not contained in any of the sets, add it to the + // primary. + LocalVocabIndex getIndexAndAddIfNotContained(const LocalVocabEntry& word); + LocalVocabIndex getIndexAndAddIfNotContained(LocalVocabEntry&& word); - // Get the index of a word in the local vocabulary, or std::nullopt if it is - // not contained. This is useful for testing. + // Like `getIndexAndAddIfNotContained`, but if the `LocalVocabEntry` is not + // contained in any of the sets, do not add it and return `std::nullopt`. std::optional getIndexOrNullopt( - const LiteralOrIri& word) const; + const LocalVocabEntry& word) const; - // The number of words in the vocabulary. - // Note: This is not constant time, but linear in the number of word sets. + // The number of words in this local vocabulary. size_t size() const { - auto result = primaryWordSet().size(); - for (const auto& previous : otherWordSets_) { - result += previous->size(); + if constexpr (ad_utility::areExpensiveChecksEnabled) { + auto size = primaryWordSet().size(); + for (const auto& previous : otherWordSets_) { + size += previous->size(); + } + AD_CORRECTNESS_CHECK(size == size_); } - return result; + return size_; } // Return true if and only if the local vocabulary is empty. bool empty() const { return size() == 0; } - // Return a const reference to the word. - const LiteralOrIri& getWord(LocalVocabIndex localVocabIndex) const; - - // Create a local vocab that contains and keeps alive all the words from each - // of the `vocabs`. The primary word set of the newly created vocab is empty. - static LocalVocab merge(std::span vocabs); + // Get the `LocalVocabEntry` corresponding to the given `LocalVocabIndex`. + const LocalVocabEntry& getWord(LocalVocabIndex localVocabIndex) const; - // Merge all passed local vocabs to keep alive all the words from each of the - // `vocabs`. + // Add all sets (primary and other) of the given local vocabs as other sets + // to this local vocab. The purpose is to keep all the contained + // `LocalVocabEntry`s alive as long as this `LocalVocab` is alive. The + // primary set of this `LocalVocab` remains unchanged. template void mergeWith(const R& vocabs) { auto inserter = std::back_inserter(otherWordSets_); - for (const auto& vocab : vocabs) { + using std::views::filter; + for (const auto& vocab : vocabs | filter(std::not_fn(&LocalVocab::empty))) { std::ranges::copy(vocab.otherWordSets_, inserter); *inserter = vocab.primaryWordSet_; + size_ += vocab.size_; } } - // Return all the words from all the word sets as a vector. - std::vector getAllWordsForTesting() const; + // Create a new local vocab with empty set and other sets that are the union + // of all sets (primary and other) of the given local vocabs. + static LocalVocab merge(std::span vocabs); + + // Return all the words from all the word sets as a vector. This is useful + // for testing. + std::vector getAllWordsForTesting() const; // Get a new BlankNodeIndex using the LocalBlankNodeManager. [[nodiscard]] BlankNodeIndex getBlankNodeIndex( @@ -114,8 +126,12 @@ class LocalVocab { bool isBlankNodeIndexContained(BlankNodeIndex blankNodeIndex) const; private: - // Common implementation for the two variants of - // `getIndexAndAddIfNotContainedImpl` above. + // Accessors for the primary set. + Set& primaryWordSet() { return *primaryWordSet_; } + const Set& primaryWordSet() const { return *primaryWordSet_; } + + // Common implementation for the two methods `getIndexAndAddIfNotContained` + // and `getIndexOrNullopt` above. template LocalVocabIndex getIndexAndAddIfNotContainedImpl(WordT&& word); }; diff --git a/src/index/LocalVocabEntry.h b/src/index/LocalVocabEntry.h index 545d8a8350..e591ad64ff 100644 --- a/src/index/LocalVocabEntry.h +++ b/src/index/LocalVocabEntry.h @@ -27,7 +27,7 @@ class alignas(16) LocalVocabEntry // the first *larger* word in the vocabulary. Note: we store the cache as // three separate atomics to avoid mutexes. The downside is, that in parallel // code multiple threads might look up the position concurrently, which wastes - // a bit of resources. We however don't consider this case to be likely. + // a bit of resources. However, we don't consider this case to be likely. mutable ad_utility::CopyableAtomic lowerBoundInVocab_; mutable ad_utility::CopyableAtomic upperBoundInVocab_; mutable ad_utility::CopyableAtomic positionInVocabKnown_ = false; From 220b7e047321f1800ff35819294da1ea396547c0 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Tue, 19 Nov 2024 01:10:41 +0100 Subject: [PATCH 2/3] Fix the obligatory typo --- src/engine/LocalVocab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 7f9082f3be..072b8ab857 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -21,7 +21,7 @@ // `LiteralOrIri`s that are not part of the original vocabulary (which stems // from the input data). The implementation is subtle and quite clever: // -// The entrys of the local vocabulary are `LocalVocabEntry`s, each of which +// The entries of the local vocabulary are `LocalVocabEntry`s, each of which // holds a `LiteralOrIri` and remembers its position in the original vocabulary // after it has been computed once. // From f4ccd87bda024d6c19098bf8fafdfa7fee942267 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Tue, 19 Nov 2024 11:43:30 +0100 Subject: [PATCH 3/3] Try to make SonarCloud happy --- src/engine/LocalVocab.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index ea16ae3215..57eaab68a6 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -32,7 +32,7 @@ LocalVocab LocalVocab::merge(std::span vocabs) { template LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { auto [wordIterator, isNewWord] = primaryWordSet().insert(AD_FWD(word)); - size_ += isNewWord; + size_ += static_cast(isNewWord); // TODO Use std::to_address (more idiomatic, but currently breaks // the MacOS build. return &(*wordIterator);