Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into AD_LOG
Browse files Browse the repository at this point in the history
  • Loading branch information
Hannah Bast committed Nov 21, 2024
2 parents 032eade + d53d4f9 commit e1bb1f5
Show file tree
Hide file tree
Showing 12 changed files with 835 additions and 167 deletions.
44 changes: 22 additions & 22 deletions src/engine/LocalVocab.cpp
Original file line number Diff line number Diff line change
@@ -1,56 +1,55 @@
// Copyright 2022, University of Freiburg
// Copyright 2022 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Hannah Bast <[email protected]>
// Authors: Hannah Bast <[email protected]>
// Johannes Kalmbach <[email protected]>

#include "engine/LocalVocab.h"

#include "absl/strings/str_cat.h"
#include "global/Id.h"
#include "global/ValueId.h"
#include "util/TransparentFunctors.h"

// _____________________________________________________________________________
LocalVocab LocalVocab::clone() const {
LocalVocab localVocabClone;
localVocabClone.otherWordSets_ = otherWordSets_;
localVocabClone.otherWordSets_.push_back(primaryWordSet_);
// Return the clone.
return localVocabClone;
LocalVocab result;
result.mergeWith(std::span{this, 1});
AD_CORRECTNESS_CHECK(result.size_ == size_);
return result;
}

// _____________________________________________________________________________
LocalVocab LocalVocab::merge(std::span<const LocalVocab*> vocabs) {
LocalVocab res;
res.mergeWith(vocabs |
std::views::transform(
[](const LocalVocab* localVocab) -> const LocalVocab& {
return *localVocab;
}));
return res;
LocalVocab result;
result.mergeWith(vocabs | std::views::transform(ad_utility::dereference));
return result;
}

// _____________________________________________________________________________
// Shared implementation of the `getIndexAndAddIfNotContained` overloads:
// insert `word` into the primary word set (perfectly forwarding it, so that
// rvalues are moved and lvalues are copied) and return the address of the
// stored entry. The address of the entry inside the set serves as the
// `LocalVocabIndex`.
template <typename WordT>
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
  auto insertionResult = primaryWordSet().insert(AD_FWD(word));
  // Only count the word if it was not yet contained in the primary set.
  if (insertionResult.second) {
    ++size_;
  }
  // TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
  // the MacOS build).
  return &(*insertionResult.first);
}

// _____________________________________________________________________________
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(
const LiteralOrIri& word) {
const LocalVocabEntry& word) {
return getIndexAndAddIfNotContainedImpl(word);
}

// _____________________________________________________________________________
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(LiteralOrIri&& word) {
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(
LocalVocabEntry&& word) {
return getIndexAndAddIfNotContainedImpl(std::move(word));
}

// _____________________________________________________________________________
std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
const LiteralOrIri& word) const {
const LocalVocabEntry& word) const {
auto localVocabIndex = primaryWordSet().find(word);
if (localVocabIndex != primaryWordSet().end()) {
// TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
Expand All @@ -62,15 +61,14 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
}

// _____________________________________________________________________________
const LocalVocab::LiteralOrIri& LocalVocab::getWord(
const LocalVocabEntry& LocalVocab::getWord(
LocalVocabIndex localVocabIndex) const {
return *localVocabIndex;
}

// _____________________________________________________________________________
std::vector<LocalVocab::LiteralOrIri> LocalVocab::getAllWordsForTesting()
const {
std::vector<LiteralOrIri> result;
std::vector<LocalVocabEntry> LocalVocab::getAllWordsForTesting() const {
std::vector<LocalVocabEntry> result;
std::ranges::copy(primaryWordSet(), std::back_inserter(result));
for (const auto& previous : otherWordSets_) {
std::ranges::copy(*previous, std::back_inserter(result));
Expand All @@ -84,7 +82,9 @@ BlankNodeIndex LocalVocab::getBlankNodeIndex(
AD_CONTRACT_CHECK(blankNodeManager);
// Initialize the `localBlankNodeManager_` if it doesn't exist yet.
if (!localBlankNodeManager_) [[unlikely]] {
localBlankNodeManager_.emplace(blankNodeManager);
localBlankNodeManager_ =
std::make_shared<ad_utility::BlankNodeManager::LocalBlankNodeManager>(
blankNodeManager);
}
return BlankNodeIndex::make(localBlankNodeManager_->getId());
}
Expand Down
147 changes: 97 additions & 50 deletions src/engine/LocalVocab.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
// Copyright 2022, University of Freiburg
// Copyright 2022 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Hannah Bast <[email protected]>
// Authors: Hannah Bast <[email protected]>
// Johannes Kalmbach <[email protected]>

#pragma once

#include <algorithm>
#include <cstdlib>
#include <memory>
#include <ranges>
#include <span>
#include <string>
#include <vector>
Expand All @@ -14,32 +17,40 @@
#include "global/Id.h"
#include "parser/LiteralOrIri.h"
#include "util/BlankNodeManager.h"
#include "util/Exception.h"

// A class for maintaining a local vocabulary with contiguous (local) IDs. This
// is meant for words that are not part of the normal vocabulary (constructed
// from the input data at indexing time).
// A class for maintaining a local vocabulary, which conceptually is a set of
// `LiteralOrIri`s that are not part of the original vocabulary (which stems
// from the input data). The implementation is subtle and quite clever:
//

// The entries of the local vocabulary are `LocalVocabEntry`s, each of which
// holds a `LiteralOrIri` and remembers its position in the original vocabulary
// after it has been computed once.
//
// A `LocalVocab` has a primary set of `LocalVocabEntry`s, which can grow
// dynamically, and a collection of other sets of `LocalVocabEntry`s, which
// cannot be modified by this class. A `LocalVocabEntry` lives exactly as long
// as it is contained in at least one of the (primary or other) sets of a
// `LocalVocab`.
class LocalVocab {
private:
using Entry = LocalVocabEntry;
using LiteralOrIri = LocalVocabEntry;
// A map of the words in the local vocabulary to their local IDs. This is a
// node hash map because we need the addresses of the words (which are of type
// `LiteralOrIri`) to remain stable over their lifetime in the hash map
// because we hand out pointers to them.
using Set = absl::node_hash_set<LiteralOrIri>;
// The primary set of `LocalVocabEntry`s, which can grow dynamically.
//
// NOTE: This is an `absl::node_hash_set` because we hand out pointers to
// the `LocalVocabEntry`s and it is hence essential that their addresses
// remain stable over their lifetime in the hash set.
using Set = absl::node_hash_set<LocalVocabEntry>;
std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>();

// Local vocabularies from child operations that were merged into this
// vocabulary s.t. the pointers are kept alive. They have to be `const`
// because they are possibly shared concurrently (for example via the cache).
// The other sets of `LocalVocabEntry`s, which are static.
std::vector<std::shared_ptr<const Set>> otherWordSets_;

auto& primaryWordSet() { return *primaryWordSet_; }
const auto& primaryWordSet() const { return *primaryWordSet_; }
// The number of words (so that we can compute `size()` in constant time).
size_t size_ = 0;

std::optional<ad_utility::BlankNodeManager::LocalBlankNodeManager>
// Each `LocalVocab` has its own `LocalBlankNodeManager` to generate blank
// nodes when needed (e.g., when parsing the result of a SERVICE query).
std::shared_ptr<ad_utility::BlankNodeManager::LocalBlankNodeManager>
localBlankNodeManager_;

public:
Expand All @@ -50,60 +61,92 @@ class LocalVocab {
LocalVocab(const LocalVocab&) = delete;
LocalVocab& operator=(const LocalVocab&) = delete;

// Make a logical copy. The clone will have an empty primary set so it can
// safely be modified. The contents are copied as shared pointers to const, so
// the function runs in linear time in the number of word sets.
// Make a logical copy, where all sets of `LocalVocabEntry`s become "other"
// sets, that is, they cannot be modified by the copy. The primary set becomes
// empty. This only copies shared pointers and takes time linear in the number
// of sets.
LocalVocab clone() const;

// Moving a local vocabulary is not problematic (though the typical use case
// in our code is to copy shared pointers to local vocabularies).
// in our code is to copy shared pointers from one `LocalVocab` to another).
LocalVocab(LocalVocab&&) = default;
LocalVocab& operator=(LocalVocab&&) = default;

// Get the index of a word in the local vocabulary. If the word was already
// contained, return the already existing index. If the word was not yet
// contained, add it, and return the new index.
LocalVocabIndex getIndexAndAddIfNotContained(const LiteralOrIri& word);
LocalVocabIndex getIndexAndAddIfNotContained(LiteralOrIri&& word);
// For a given `LocalVocabEntry`, return the corresponding `LocalVocabIndex`
// (which is just the address of the `LocalVocabEntry`). If the
// `LocalVocabEntry` is not yet contained in the primary set (only the primary
// set is searched), add it to the primary set.
LocalVocabIndex getIndexAndAddIfNotContained(const LocalVocabEntry& word);
LocalVocabIndex getIndexAndAddIfNotContained(LocalVocabEntry&& word);

// Get the index of a word in the local vocabulary, or std::nullopt if it is
// not contained. This is useful for testing.
// Like `getIndexAndAddIfNotContained`, but if the `LocalVocabEntry` is not
// contained in any of the sets, do not add it and return `std::nullopt`.
std::optional<LocalVocabIndex> getIndexOrNullopt(
const LiteralOrIri& word) const;
const LocalVocabEntry& word) const;

// The number of words in the vocabulary.
// Note: This is not constant time, but linear in the number of word sets.
// The number of words in this local vocabulary.
size_t size() const {
auto result = primaryWordSet().size();
for (const auto& previous : otherWordSets_) {
result += previous->size();
if constexpr (ad_utility::areExpensiveChecksEnabled) {
auto size = primaryWordSet().size();
for (const auto& previous : otherWordSets_) {
size += previous->size();
}
AD_CORRECTNESS_CHECK(size == size_);
}
return result;
return size_;
}

// Return true if and only if the local vocabulary is empty.
bool empty() const { return size() == 0; }

// Return a const reference to the word.
const LiteralOrIri& getWord(LocalVocabIndex localVocabIndex) const;
// Get the `LocalVocabEntry` corresponding to the given `LocalVocabIndex`.
//
// NOTE: This used to be a more complex function but is now a simple
// dereference. It could be thrown out in the future.
const LocalVocabEntry& getWord(LocalVocabIndex localVocabIndex) const;

// Create a local vocab that contains and keeps alive all the words from each
// of the `vocabs`. The primary word set of the newly created vocab is empty.
static LocalVocab merge(std::span<const LocalVocab*> vocabs);

// Merge all passed local vocabs to keep alive all the words from each of the
// `vocabs`.
// Add all sets (primary and other) of the given local vocabs as other sets
// to this local vocab. The purpose is to keep all the contained
// `LocalVocabEntry`s alive as long as this `LocalVocab` is alive. The
// primary set of this `LocalVocab` remains unchanged.
template <std::ranges::range R>
void mergeWith(const R& vocabs) {
auto inserter = std::back_inserter(otherWordSets_);
for (const auto& vocab : vocabs) {
using std::views::filter;
for (const auto& vocab : vocabs | filter(std::not_fn(&LocalVocab::empty))) {
std::ranges::copy(vocab.otherWordSets_, inserter);
*inserter = vocab.primaryWordSet_;
size_ += vocab.size_;
}

// Also merge the `vocabs` `LocalBlankNodeManager`s, if they exist.
using LocalBlankNodeManager =
ad_utility::BlankNodeManager::LocalBlankNodeManager;
auto localManagersView =
vocabs |
std::views::transform([](const LocalVocab& vocab) -> const auto& {
return vocab.localBlankNodeManager_;
});

auto it = std::ranges::find_if(localManagersView,
[](const auto& l) { return l != nullptr; });
if (it == localManagersView.end()) {
return;
}
if (!localBlankNodeManager_) {
localBlankNodeManager_ =
std::make_shared<LocalBlankNodeManager>((*it)->blankNodeManager());
}
localBlankNodeManager_->mergeWith(localManagersView);
}

// Return all the words from all the word sets as a vector.
std::vector<LiteralOrIri> getAllWordsForTesting() const;
// Create a new local vocab with an empty primary set and other sets that are
// the union of all sets (primary and other) of the given local vocabs.
static LocalVocab merge(std::span<const LocalVocab*> vocabs);

// Return all the words from all the word sets as a vector. This is useful
// for testing.
std::vector<LocalVocabEntry> getAllWordsForTesting() const;

// Get a new BlankNodeIndex using the LocalBlankNodeManager.
[[nodiscard]] BlankNodeIndex getBlankNodeIndex(
Expand All @@ -114,8 +157,12 @@ class LocalVocab {
bool isBlankNodeIndexContained(BlankNodeIndex blankNodeIndex) const;

private:
// Common implementation for the two variants of
// `getIndexAndAddIfNotContainedImpl` above.
// Accessors for the primary set.
Set& primaryWordSet() { return *primaryWordSet_; }
const Set& primaryWordSet() const { return *primaryWordSet_; }

// Common implementation for the two overloads of
// `getIndexAndAddIfNotContained` above.
template <typename WordT>
LocalVocabIndex getIndexAndAddIfNotContainedImpl(WordT&& word);
};
6 changes: 2 additions & 4 deletions src/engine/TransitivePathBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ Result::Generator TransitivePathBase::fillTableWithHullImpl(
ad_utility::Timer timer{ad_utility::Timer::Stopped};
size_t outputRow = 0;
IdTableStatic<OUTPUT_WIDTH> table{getResultWidth(), allocator()};
std::vector<LocalVocab> storedLocalVocabs;
LocalVocab mergedVocab{};
for (auto& [node, linkedNodes, localVocab, idTable, inputRow] : hull) {
timer.cont();
// As an optimization nodes without any linked nodes should not get yielded
Expand All @@ -120,7 +120,7 @@ Result::Generator TransitivePathBase::fillTableWithHullImpl(
}

if (yieldOnce) {
storedLocalVocabs.emplace_back(std::move(localVocab));
mergedVocab.mergeWith(std::span{&localVocab, 1});
} else {
timer.stop();
runtimeInfo().addDetail("IdTable fill time", timer.msecs());
Expand All @@ -132,8 +132,6 @@ Result::Generator TransitivePathBase::fillTableWithHullImpl(
}
if (yieldOnce) {
timer.start();
LocalVocab mergedVocab{};
mergedVocab.mergeWith(storedLocalVocabs);
runtimeInfo().addDetail("IdTable fill time", timer.msecs());
co_yield {std::move(table).toDynamic(), std::move(mergedVocab)};
}
Expand Down
16 changes: 13 additions & 3 deletions src/engine/TransitivePathImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ class TransitivePathImpl : public TransitivePathBase {
transitiveHull(edges, sub->getCopyOfLocalVocab(), std::move(nodes),
targetSide.isVariable()
? std::nullopt
: std::optional{std::get<Id>(targetSide.value_)});
: std::optional{std::get<Id>(targetSide.value_)},
yieldOnce);

auto result = fillTableWithHull(
std::move(hull), startSide.outputCol_, targetSide.outputCol_,
Expand Down Expand Up @@ -131,7 +132,8 @@ class TransitivePathImpl : public TransitivePathBase {
edges, sub->getCopyOfLocalVocab(), std::span{&tableInfo, 1},
targetSide.isVariable()
? std::nullopt
: std::optional{std::get<Id>(targetSide.value_)});
: std::optional{std::get<Id>(targetSide.value_)},
yieldOnce);

auto result = fillTableWithHull(std::move(hull), startSide.outputCol_,
targetSide.outputCol_, yieldOnce);
Expand Down Expand Up @@ -240,11 +242,15 @@ class TransitivePathImpl : public TransitivePathBase {
* `TableColumnWithVocab` that can be consumed to create a transitive hull.
* @param target Optional target Id. If supplied, only paths which end
* in this Id are added to the hull.
* @param yieldOnce This has to be set to the same value as the consuming
* code. When set to true, this will prevent yielding the same LocalVocab over
* and over again to make merging faster (because merging with an empty
* LocalVocab is a no-op).
* @return Map Maps each Id to its connected Ids in the transitive hull
*/
NodeGenerator transitiveHull(const T& edges, LocalVocab edgesVocab,
std::ranges::range auto startNodes,
std::optional<Id> target) const {
std::optional<Id> target, bool yieldOnce) const {
ad_utility::Timer timer{ad_utility::Timer::Stopped};
for (auto&& tableColumn : startNodes) {
timer.cont();
Expand All @@ -260,6 +266,10 @@ class TransitivePathImpl : public TransitivePathBase {
mergedVocab.clone(), tableColumn.table_,
currentRow};
timer.cont();
// Reset vocab to prevent merging the same vocab over and over again.
if (yieldOnce) {
mergedVocab = LocalVocab{};
}
}
currentRow++;
}
Expand Down
2 changes: 1 addition & 1 deletion src/index/LocalVocabEntry.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class alignas(16) LocalVocabEntry
// the first *larger* word in the vocabulary. Note: we store the cache as
// three separate atomics to avoid mutexes. The downside is, that in parallel
// code multiple threads might look up the position concurrently, which wastes
// a bit of resources. We however don't consider this case to be likely.
// a bit of resources. However, we don't consider this case to be likely.
mutable ad_utility::CopyableAtomic<VocabIndex> lowerBoundInVocab_;
mutable ad_utility::CopyableAtomic<VocabIndex> upperBoundInVocab_;
mutable ad_utility::CopyableAtomic<bool> positionInVocabKnown_ = false;
Expand Down
Loading

0 comments on commit e1bb1f5

Please sign in to comment.