diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index 1dbc8e7c68..2170d20d5d 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -1,37 +1,35 @@ -// Copyright 2022, University of Freiburg +// Copyright 2022 - 2024, University of Freiburg // Chair of Algorithms and Data Structures -// Author: Hannah Bast +// Authors: Hannah Bast +// Johannes Kalmbach #include "engine/LocalVocab.h" #include "absl/strings/str_cat.h" #include "global/Id.h" #include "global/ValueId.h" +#include "util/TransparentFunctors.h" // _____________________________________________________________________________ LocalVocab LocalVocab::clone() const { - LocalVocab localVocabClone; - localVocabClone.otherWordSets_ = otherWordSets_; - localVocabClone.otherWordSets_.push_back(primaryWordSet_); - // Return the clone. - return localVocabClone; + LocalVocab result; + result.mergeWith(std::span{this, 1}); + AD_CORRECTNESS_CHECK(result.size_ == size_); + return result; } // _____________________________________________________________________________ LocalVocab LocalVocab::merge(std::span vocabs) { - LocalVocab res; - res.mergeWith(vocabs | - std::views::transform( - [](const LocalVocab* localVocab) -> const LocalVocab& { - return *localVocab; - })); - return res; + LocalVocab result; + result.mergeWith(vocabs | std::views::transform(ad_utility::dereference)); + return result; } // _____________________________________________________________________________ template LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { auto [wordIterator, isNewWord] = primaryWordSet().insert(AD_FWD(word)); + size_ += static_cast(isNewWord); // TODO Use std::to_address (more idiomatic, but currently breaks // the MacOS build. return &(*wordIterator); @@ -39,18 +37,19 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { // _____________________________________________________________________________ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained( - const LiteralOrIri& word) { + const LocalVocabEntry& word) { return getIndexAndAddIfNotContainedImpl(word); } // _____________________________________________________________________________ -LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(LiteralOrIri&& word) { +LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained( + LocalVocabEntry&& word) { return getIndexAndAddIfNotContainedImpl(std::move(word)); } // _____________________________________________________________________________ std::optional LocalVocab::getIndexOrNullopt( - const LiteralOrIri& word) const { + const LocalVocabEntry& word) const { auto localVocabIndex = primaryWordSet().find(word); if (localVocabIndex != primaryWordSet().end()) { // TODO Use std::to_address (more idiomatic, but currently breaks @@ -62,15 +61,14 @@ std::optional LocalVocab::getIndexOrNullopt( } // _____________________________________________________________________________ -const LocalVocab::LiteralOrIri& LocalVocab::getWord( +const LocalVocabEntry& LocalVocab::getWord( LocalVocabIndex localVocabIndex) const { return *localVocabIndex; } // _____________________________________________________________________________ -std::vector LocalVocab::getAllWordsForTesting() - const { - std::vector result; +std::vector LocalVocab::getAllWordsForTesting() const { + std::vector result; std::ranges::copy(primaryWordSet(), std::back_inserter(result)); for (const auto& previous : otherWordSets_) { std::ranges::copy(*previous, std::back_inserter(result)); @@ -84,7 +82,9 @@ BlankNodeIndex LocalVocab::getBlankNodeIndex( AD_CONTRACT_CHECK(blankNodeManager); // Initialize the `localBlankNodeManager_` if it doesn't exist yet. if (!localBlankNodeManager_) [[unlikely]] { - localBlankNodeManager_.emplace(blankNodeManager); + localBlankNodeManager_ = + std::make_shared( + blankNodeManager); } return BlankNodeIndex::make(localBlankNodeManager_->getId()); } diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 3055c400a6..f61982400d 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -1,11 +1,14 @@ -// Copyright 2022, University of Freiburg +// Copyright 2022 - 2024, University of Freiburg // Chair of Algorithms and Data Structures -// Author: Hannah Bast +// Authors: Hannah Bast +// Johannes Kalmbach #pragma once +#include #include #include +#include #include #include #include @@ -14,32 +17,40 @@ #include "global/Id.h" #include "parser/LiteralOrIri.h" #include "util/BlankNodeManager.h" +#include "util/Exception.h" -// A class for maintaining a local vocabulary with contiguous (local) IDs. This -// is meant for words that are not part of the normal vocabulary (constructed -// from the input data at indexing time). +// A class for maintaining a local vocabulary, which conceptually is a set of +// `LiteralOrIri`s that are not part of the original vocabulary (which stems +// from the input data). The implementation is subtle and quite clever: // - +// The entries of the local vocabulary are `LocalVocabEntry`s, each of which +// holds a `LiteralOrIri` and remembers its position in the original vocabulary +// after it has been computed once. +// +// A `LocalVocab` has a primary set of `LocalVocabEntry`s, which can grow +// dynamically, and a collection of other sets of `LocalVocabEntry`s, which +// cannot be modified by this class. A `LocalVocabEntry` lives exactly as long +// as it is contained in at least one of the (primary or other) sets of a +// `LocalVocab`. class LocalVocab { private: - using Entry = LocalVocabEntry; - using LiteralOrIri = LocalVocabEntry; - // A map of the words in the local vocabulary to their local IDs. This is a - // node hash map because we need the addresses of the words (which are of type - // `LiteralOrIri`) to remain stable over their lifetime in the hash map - // because we hand out pointers to them. - using Set = absl::node_hash_set; + // The primary set of `LocalVocabEntry`s, which can grow dynamically. + // + // NOTE: This is a `absl::node_hash_set` because we hand out pointers to + // the `LocalVocabEntry`s and it is hence essential that their addresses + // remain stable over their lifetime in the hash set. + using Set = absl::node_hash_set; std::shared_ptr primaryWordSet_ = std::make_shared(); - // Local vocabularies from child operations that were merged into this - // vocabulary s.t. the pointers are kept alive. They have to be `const` - // because they are possibly shared concurrently (for example via the cache). + // The other sets of `LocalVocabEntry`s, which are static. std::vector> otherWordSets_; - auto& primaryWordSet() { return *primaryWordSet_; } - const auto& primaryWordSet() const { return *primaryWordSet_; } + // The number of words (so that we can compute `size()` in constant time). + size_t size_ = 0; - std::optional + // Each `LocalVocab` has its own `LocalBlankNodeManager` to generate blank + // nodes when needed (e.g., when parsing the result of a SERVICE query). + std::shared_ptr localBlankNodeManager_; public: @@ -50,60 +61,92 @@ class LocalVocab { LocalVocab(const LocalVocab&) = delete; LocalVocab& operator=(const LocalVocab&) = delete; - // Make a logical copy. The clone will have an empty primary set so it can - // safely be modified. The contents are copied as shared pointers to const, so - // the function runs in linear time in the number of word sets. + // Make a logical copy, where all sets of `LocalVocabEntry`s become "other" + // sets, that is, they cannot be modified by the copy. The primary set becomes + // empty. This only copies shared pointers and takes time linear in the number + // of sets. LocalVocab clone() const; // Moving a local vocabulary is not problematic (though the typical use case - // in our code is to copy shared pointers to local vocabularies). + // in our code is to copy shared pointers from one `LocalVocab` to another). LocalVocab(LocalVocab&&) = default; LocalVocab& operator=(LocalVocab&&) = default; - // Get the index of a word in the local vocabulary. If the word was already - // contained, return the already existing index. If the word was not yet - // contained, add it, and return the new index. - LocalVocabIndex getIndexAndAddIfNotContained(const LiteralOrIri& word); - LocalVocabIndex getIndexAndAddIfNotContained(LiteralOrIri&& word); + // For a given `LocalVocabEntry`, return the corresponding `LocalVocabIndex` + // (which is just the address of the `LocalVocabEntry`). If the + // `LocalVocabEntry` is not contained in any of the sets, add it to the + // primary. + LocalVocabIndex getIndexAndAddIfNotContained(const LocalVocabEntry& word); + LocalVocabIndex getIndexAndAddIfNotContained(LocalVocabEntry&& word); - // Get the index of a word in the local vocabulary, or std::nullopt if it is - // not contained. This is useful for testing. + // Like `getIndexAndAddIfNotContained`, but if the `LocalVocabEntry` is not + // contained in any of the sets, do not add it and return `std::nullopt`. std::optional getIndexOrNullopt( - const LiteralOrIri& word) const; + const LocalVocabEntry& word) const; - // The number of words in the vocabulary. - // Note: This is not constant time, but linear in the number of word sets. + // The number of words in this local vocabulary. size_t size() const { - auto result = primaryWordSet().size(); - for (const auto& previous : otherWordSets_) { - result += previous->size(); + if constexpr (ad_utility::areExpensiveChecksEnabled) { + auto size = primaryWordSet().size(); + for (const auto& previous : otherWordSets_) { + size += previous->size(); + } + AD_CORRECTNESS_CHECK(size == size_); } - return result; + return size_; } // Return true if and only if the local vocabulary is empty. bool empty() const { return size() == 0; } - // Return a const reference to the word. - const LiteralOrIri& getWord(LocalVocabIndex localVocabIndex) const; + // Get the `LocalVocabEntry` corresponding to the given `LocalVocabIndex`. + // + // NOTE: This used to be a more complex function but is now a simple + // dereference. It could be thrown out in the future. + const LocalVocabEntry& getWord(LocalVocabIndex localVocabIndex) const; - // Create a local vocab that contains and keeps alive all the words from each - // of the `vocabs`. The primary word set of the newly created vocab is empty. - static LocalVocab merge(std::span vocabs); - - // Merge all passed local vocabs to keep alive all the words from each of the - // `vocabs`. + // Add all sets (primary and other) of the given local vocabs as other sets + // to this local vocab. The purpose is to keep all the contained + // `LocalVocabEntry`s alive as long as this `LocalVocab` is alive. The + // primary set of this `LocalVocab` remains unchanged. template void mergeWith(const R& vocabs) { auto inserter = std::back_inserter(otherWordSets_); - for (const auto& vocab : vocabs) { + using std::views::filter; + for (const auto& vocab : vocabs | filter(std::not_fn(&LocalVocab::empty))) { std::ranges::copy(vocab.otherWordSets_, inserter); *inserter = vocab.primaryWordSet_; + size_ += vocab.size_; + } + + // Also merge the `vocabs` `LocalBlankNodeManager`s, if they exist. + using LocalBlankNodeManager = + ad_utility::BlankNodeManager::LocalBlankNodeManager; + auto localManagersView = + vocabs | + std::views::transform([](const LocalVocab& vocab) -> const auto& { + return vocab.localBlankNodeManager_; + }); + + auto it = std::ranges::find_if(localManagersView, + [](const auto& l) { return l != nullptr; }); + if (it == localManagersView.end()) { + return; + } + if (!localBlankNodeManager_) { + localBlankNodeManager_ = + std::make_shared((*it)->blankNodeManager()); } + localBlankNodeManager_->mergeWith(localManagersView); } - // Return all the words from all the word sets as a vector. - std::vector getAllWordsForTesting() const; + // Create a new local vocab with empty set and other sets that are the union + // of all sets (primary and other) of the given local vocabs. + static LocalVocab merge(std::span vocabs); + + // Return all the words from all the word sets as a vector. This is useful + // for testing. + std::vector getAllWordsForTesting() const; // Get a new BlankNodeIndex using the LocalBlankNodeManager. [[nodiscard]] BlankNodeIndex getBlankNodeIndex( @@ -114,8 +157,12 @@ class LocalVocab { bool isBlankNodeIndexContained(BlankNodeIndex blankNodeIndex) const; private: - // Common implementation for the two variants of - // `getIndexAndAddIfNotContainedImpl` above. + // Accessors for the primary set. + Set& primaryWordSet() { return *primaryWordSet_; } + const Set& primaryWordSet() const { return *primaryWordSet_; } + + // Common implementation for the two methods `getIndexAndAddIfNotContained` + // and `getIndexOrNullopt` above. template LocalVocabIndex getIndexAndAddIfNotContainedImpl(WordT&& word); }; diff --git a/src/engine/TransitivePathBase.cpp b/src/engine/TransitivePathBase.cpp index a833bfdfbd..1db8a7eb0d 100644 --- a/src/engine/TransitivePathBase.cpp +++ b/src/engine/TransitivePathBase.cpp @@ -93,7 +93,7 @@ Result::Generator TransitivePathBase::fillTableWithHullImpl( ad_utility::Timer timer{ad_utility::Timer::Stopped}; size_t outputRow = 0; IdTableStatic table{getResultWidth(), allocator()}; - std::vector storedLocalVocabs; + LocalVocab mergedVocab{}; for (auto& [node, linkedNodes, localVocab, idTable, inputRow] : hull) { timer.cont(); // As an optimization nodes without any linked nodes should not get yielded @@ -120,7 +120,7 @@ Result::Generator TransitivePathBase::fillTableWithHullImpl( } if (yieldOnce) { - storedLocalVocabs.emplace_back(std::move(localVocab)); + mergedVocab.mergeWith(std::span{&localVocab, 1}); } else { timer.stop(); runtimeInfo().addDetail("IdTable fill time", timer.msecs()); @@ -132,8 +132,6 @@ Result::Generator TransitivePathBase::fillTableWithHullImpl( } if (yieldOnce) { timer.start(); - LocalVocab mergedVocab{}; - mergedVocab.mergeWith(storedLocalVocabs); runtimeInfo().addDetail("IdTable fill time", timer.msecs()); co_yield {std::move(table).toDynamic(), std::move(mergedVocab)}; } diff --git a/src/engine/TransitivePathImpl.h b/src/engine/TransitivePathImpl.h index 407b63a298..3e15141114 100644 --- a/src/engine/TransitivePathImpl.h +++ b/src/engine/TransitivePathImpl.h @@ -82,7 +82,8 @@ class TransitivePathImpl : public TransitivePathBase { transitiveHull(edges, sub->getCopyOfLocalVocab(), std::move(nodes), targetSide.isVariable() ? std::nullopt - : std::optional{std::get(targetSide.value_)}); + : std::optional{std::get(targetSide.value_)}, + yieldOnce); auto result = fillTableWithHull( std::move(hull), startSide.outputCol_, targetSide.outputCol_, @@ -131,7 +132,8 @@ class TransitivePathImpl : public TransitivePathBase { edges, sub->getCopyOfLocalVocab(), std::span{&tableInfo, 1}, targetSide.isVariable() ? std::nullopt - : std::optional{std::get(targetSide.value_)}); + : std::optional{std::get(targetSide.value_)}, + yieldOnce); auto result = fillTableWithHull(std::move(hull), startSide.outputCol_, targetSide.outputCol_, yieldOnce); @@ -240,11 +242,15 @@ class TransitivePathImpl : public TransitivePathBase { * `TableColumnWithVocab` that can be consumed to create a transitive hull. * @param target Optional target Id. If supplied, only paths which end * in this Id are added to the hull. + * @param yieldOnce This has to be set to the same value as the consuming + * code. When set to true, this will prevent yielding the same LocalVocab over + * and over again to make merging faster (because merging with an empty + * LocalVocab is a no-op). * @return Map Maps each Id to its connected Ids in the transitive hull */ NodeGenerator transitiveHull(const T& edges, LocalVocab edgesVocab, std::ranges::range auto startNodes, - std::optional target) const { + std::optional target, bool yieldOnce) const { ad_utility::Timer timer{ad_utility::Timer::Stopped}; for (auto&& tableColumn : startNodes) { timer.cont(); @@ -260,6 +266,10 @@ class TransitivePathImpl : public TransitivePathBase { mergedVocab.clone(), tableColumn.table_, currentRow}; timer.cont(); + // Reset vocab to prevent merging the same vocab over and over again. + if (yieldOnce) { + mergedVocab = LocalVocab{}; + } } currentRow++; } diff --git a/src/index/LocalVocabEntry.h b/src/index/LocalVocabEntry.h index 545d8a8350..e591ad64ff 100644 --- a/src/index/LocalVocabEntry.h +++ b/src/index/LocalVocabEntry.h @@ -27,7 +27,7 @@ class alignas(16) LocalVocabEntry // the first *larger* word in the vocabulary. Note: we store the cache as // three separate atomics to avoid mutexes. The downside is, that in parallel // code multiple threads might look up the position concurrently, which wastes - // a bit of resources. We however don't consider this case to be likely. + // a bit of resources. However, we don't consider this case to be likely. mutable ad_utility::CopyableAtomic lowerBoundInVocab_; mutable ad_utility::CopyableAtomic upperBoundInVocab_; mutable ad_utility::CopyableAtomic positionInVocabKnown_ = false; diff --git a/src/util/BlankNodeManager.cpp b/src/util/BlankNodeManager.cpp index 6b367118d5..44295b3aeb 100644 --- a/src/util/BlankNodeManager.cpp +++ b/src/util/BlankNodeManager.cpp @@ -4,6 +4,8 @@ #include "util/BlankNodeManager.h" +#include "util/Exception.h" + namespace ad_utility { // _____________________________________________________________________________ @@ -41,30 +43,28 @@ BlankNodeManager::LocalBlankNodeManager::LocalBlankNodeManager( BlankNodeManager* blankNodeManager) : blankNodeManager_(blankNodeManager) {} -// _____________________________________________________________________________ -BlankNodeManager::LocalBlankNodeManager::~LocalBlankNodeManager() { - auto ptr = blankNodeManager_->usedBlocksSet_.wlock(); - for (const auto& block : blocks_) { - AD_CONTRACT_CHECK(ptr->contains(block.blockIdx_)); - ptr->erase(block.blockIdx_); - } -} - // _____________________________________________________________________________ uint64_t BlankNodeManager::LocalBlankNodeManager::getId() { - if (blocks_.empty() || blocks_.back().nextIdx_ == idxAfterCurrentBlock_) { - blocks_.emplace_back(blankNodeManager_->allocateBlock()); - idxAfterCurrentBlock_ = blocks_.back().nextIdx_ + blockSize_; + if (blocks_->empty() || blocks_->back().nextIdx_ == idxAfterCurrentBlock_) { + blocks_->emplace_back(blankNodeManager_->allocateBlock()); + idxAfterCurrentBlock_ = blocks_->back().nextIdx_ + blockSize_; } - return blocks_.back().nextIdx_++; + return blocks_->back().nextIdx_++; } // _____________________________________________________________________________ bool BlankNodeManager::LocalBlankNodeManager::containsBlankNodeIndex( uint64_t index) const { - return std::ranges::any_of(blocks_, [index](const Block& block) { + auto containsIndex = [index](const Block& block) { return index >= block.startIdx_ && index < block.nextIdx_; - }); + }; + + return std::ranges::any_of(*blocks_, containsIndex) || + std::ranges::any_of( + otherBlocks_, + [&](const std::shared_ptr>& blocks) { + return std::ranges::any_of(*blocks, containsIndex); + }); } } // namespace ad_utility diff --git a/src/util/BlankNodeManager.h b/src/util/BlankNodeManager.h index 7fd5416294..afdc748281 100644 --- a/src/util/BlankNodeManager.h +++ b/src/util/BlankNodeManager.h @@ -58,6 +58,7 @@ class BlankNodeManager { public: ~Block() = default; + // The index of this block. const uint64_t blockIdx_; @@ -71,7 +72,7 @@ class BlankNodeManager { class LocalBlankNodeManager { public: explicit LocalBlankNodeManager(BlankNodeManager* blankNodeManager); - ~LocalBlankNodeManager(); + ~LocalBlankNodeManager() = default; // No copy, as the managed blocks should not be duplicated. LocalBlankNodeManager(const LocalBlankNodeManager&) = delete; @@ -87,16 +88,46 @@ class BlankNodeManager { // Return true iff the `index` was returned by a previous call to `getId()`. bool containsBlankNodeIndex(uint64_t index) const; - private: - // Reserved blocks. - std::vector blocks_; + // Merge passed `LocalBlankNodeManager`s to keep alive their reserved + // BlankNodeIndex blocks. + template + void mergeWith(const R& localBlankNodeManagers) { + auto inserter = std::back_inserter(otherBlocks_); + for (const auto& l : localBlankNodeManagers) { + if (l == nullptr) { + continue; + } + std::ranges::copy(l->otherBlocks_, inserter); + *inserter = l->blocks_; + } + } + + // Getter for the `blankNodeManager_` pointer required in + // `LocalVocab::mergeWith`. + BlankNodeManager* blankNodeManager() const { return blankNodeManager_; } + private: // Reference to the BlankNodeManager, used to free the reserved blocks. BlankNodeManager* blankNodeManager_; + // Reserved blocks. + using Blocks = std::vector; + std::shared_ptr blocks_{ + new Blocks(), [blankNodeManager = blankNodeManager()](auto blocksPtr) { + auto ptr = blankNodeManager->usedBlocksSet_.wlock(); + for (const auto& block : *blocksPtr) { + AD_CONTRACT_CHECK(ptr->contains(block.blockIdx_)); + ptr->erase(block.blockIdx_); + } + delete blocksPtr; + }}; + // The first index after the current Block. uint64_t idxAfterCurrentBlock_{0}; + // Blocks merged from other `LocalBlankNodeManager`s. + std::vector> otherBlocks_; + FRIEND_TEST(BlankNodeManager, LocalBlankNodeManagerGetID); }; diff --git a/src/util/JoinAlgorithms/JoinAlgorithms.h b/src/util/JoinAlgorithms/JoinAlgorithms.h index 13d52aa2d8..64c27d74c9 100644 --- a/src/util/JoinAlgorithms/JoinAlgorithms.h +++ b/src/util/JoinAlgorithms/JoinAlgorithms.h @@ -156,7 +156,7 @@ template && isSimilar) { return true; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_convertible_v) { return row != Id::makeUndefined(); } else { return (std::ranges::none_of( @@ -664,6 +664,7 @@ struct JoinSide { [[no_unique_address]] const End end_; const Projection& projection_; CurrentBlocks currentBlocks_{}; + CurrentBlocks undefBlocks_{}; // Type aliases for a single element from a block from the left/right input. using value_type = std::ranges::range_value_t>; @@ -689,6 +690,10 @@ auto makeJoinSide(Blocks& blocks, const auto& projection) { template concept IsJoinSide = ad_utility::isInstantiation; +struct AlwaysFalse { + bool operator()(const auto&) const { return false; } +}; + // The class that actually performs the zipper join for blocks without UNDEF. // See the public `zipperJoinForBlocksWithoutUndef` function below for details. // The general approach of the algorithm is described in the following. Several @@ -736,7 +741,10 @@ concept IsJoinSide = ad_utility::isInstantiation; // `currentEl` (5 in this example). New blocks are added to one of the buffers // if they become empty at one point in the algorithm. template + typename CompatibleRowAction, + ad_utility::InvocableWithExactReturnType< + bool, typename LeftSide::ProjectedEl> + IsUndef = AlwaysFalse> struct BlockZipperJoinImpl { // The left and right inputs of the join LeftSide leftSide_; @@ -745,11 +753,14 @@ struct BlockZipperJoinImpl { const LessThan& lessThan_; // The callback that is called for each pair of matching rows. CompatibleRowAction& compatibleRowAction_; + [[no_unique_address]] IsUndef isUndefined_{}; // Type alias for the result of the projection. Elements from the left and // right input must be projected to the same type. using ProjectedEl = LeftSide::ProjectedEl; static_assert(std::same_as); + static constexpr bool potentiallyHasUndef = + !std::is_same_v; // Create an equality comparison from the `lessThan` predicate. bool eq(const auto& el1, const auto& el2) { @@ -850,12 +861,38 @@ struct BlockZipperJoinImpl { return std::tuple{std::ref(first.fullBlock()), first.subrange(), it}; } - // Call `compatibleRowAction` for all pairs of elements in the Cartesian - // product of the blocks in `blocksLeft` and `blocksRight`. + // Check if a side contains undefined values. + static bool hasUndef(const auto& side) { + if constexpr (potentiallyHasUndef) { + return !side.undefBlocks_.empty(); + } + return false; + } + + // Combine all elements from all blocks on the left with all elements from all + // blocks on the right and add them to the result. + void addCartesianProduct(const auto& blocksLeft, const auto& blocksRight) { + // TODO use `std::views::cartesian_product`. + for (const auto& lBlock : blocksLeft) { + for (const auto& rBlock : blocksRight) { + compatibleRowAction_.setInput(lBlock.fullBlock(), rBlock.fullBlock()); + for (size_t i : lBlock.getIndexRange()) { + for (size_t j : rBlock.getIndexRange()) { + compatibleRowAction_.addRow(i, j); + } + } + } + } + } + + // Handle non-matching rows from the left side for an optional join or a minus + // join. template - void addAll(const auto& blocksLeft, const auto& blocksRight) { + void addNonMatchingRowsFromLeftForOptionalJoin(const auto& blocksLeft, + const auto& blocksRight) { if constexpr (DoOptionalJoin) { - if (std::ranges::all_of( + if (!hasUndef(rightSide_) && + std::ranges::all_of( blocksRight | std::views::transform( [](const auto& inp) { return inp.subrange(); }), std::ranges::empty)) { @@ -868,17 +905,15 @@ struct BlockZipperJoinImpl { } } } - // TODO use `std::views::cartesian_product`. - for (const auto& lBlock : blocksLeft) { - for (const auto& rBlock : blocksRight) { - compatibleRowAction_.setInput(lBlock.fullBlock(), rBlock.fullBlock()); - for (size_t i : lBlock.getIndexRange()) { - for (size_t j : rBlock.getIndexRange()) { - compatibleRowAction_.addRow(i, j); - } - } - } - } + } + + // Call `compatibleRowAction` for all pairs of elements in the Cartesian + // product of the blocks in `blocksLeft` and `blocksRight`. + template + void addAll(const auto& blocksLeft, const auto& blocksRight) { + addNonMatchingRowsFromLeftForOptionalJoin(blocksLeft, + blocksRight); + addCartesianProduct(blocksLeft, blocksRight); compatibleRowAction_.flush(); } @@ -897,6 +932,59 @@ struct BlockZipperJoinImpl { return result; } + // Main implementation for `findUndefValues`. + template + cppcoro::generator findUndefValuesHelper(const auto& fullBlockLeft, + const auto& fullBlockRight, + T& begL, T& begR, + const auto& undefBlocks) { + for (const auto& undefBlock : undefBlocks) { + // Select proper input table from the stored undef blocks + if constexpr (left) { + begL = undefBlock.fullBlock().begin(); + compatibleRowAction_.setInput(undefBlock.fullBlock(), + fullBlockRight.get()); + } else { + begR = undefBlock.fullBlock().begin(); + compatibleRowAction_.setInput(fullBlockLeft.get(), + undefBlock.fullBlock()); + } + const auto& subrange = undefBlock.subrange(); + // Yield all iterators to the elements within the stored undef blocks. + for (auto subIt = subrange.begin(); subIt < subrange.end(); ++subIt) { + co_yield subIt; + } + } + // Reset back to original input + compatibleRowAction_.setInput(fullBlockLeft.get(), fullBlockRight.get()); + // No need for further iteration because we know we won't encounter any new + // undefined values at this point. + } + + // Create a generator that yields iterators to all undefined values that + // have been found so far. Note that because of limitations of the + // `zipperJoinWithUndef` interface we need to set `begL` and `begR` to the + // beginning of the full blocks of the left and right side respectively so + // that they can be used within `zipperJoinWithUndef` to compute the + // distance from the yielded iterator to the beginning of the block. + template + auto findUndefValues(const auto& fullBlockLeft, const auto& fullBlockRight, + T& begL, T& begR) { + return [this, &fullBlockLeft, &fullBlockRight, &begL, &begR]( + const auto&, auto, auto, bool) { + auto currentSide = [this]() { + if constexpr (left) { + return std::cref(leftSide_); + } else { + return std::cref(rightSide_); + } + }(); + return findUndefValuesHelper(fullBlockLeft, fullBlockRight, begL, + begR, + currentSide.get().undefBlocks_); + }; + } + // Join the first block in `currentBlocksLeft` with the first block in // `currentBlocksRight`, but ignore all elements that are `>= currentEl` // The fully joined parts of the block are then removed from @@ -912,15 +1000,16 @@ struct BlockZipperJoinImpl { getFirstBlock(currentBlocksRight, currentEl); compatibleRowAction_.setInput(fullBlockLeft.get(), fullBlockRight.get()); - auto addRowIndex = [begL = fullBlockLeft.get().begin(), - begR = fullBlockRight.get().begin(), - this](auto itFromL, auto itFromR) { + auto begL = fullBlockLeft.get().begin(); + auto begR = fullBlockRight.get().begin(); + auto addRowIndex = [&begL, &begR, this](auto itFromL, auto itFromR) { compatibleRowAction_.addRow(itFromL - begL, itFromR - begR); }; auto addNotFoundRowIndex = [&]() { if constexpr (DoOptionalJoin) { - return [begL = fullBlockLeft.get().begin(), this](auto itFromL) { + return [this, begL = fullBlockLeft.get().begin()](auto itFromL) { + AD_CORRECTNESS_CHECK(!hasUndef(rightSide_)); compatibleRowAction_.addOptionalRow(itFromL - begL); }; @@ -928,10 +1017,25 @@ struct BlockZipperJoinImpl { return ad_utility::noop; } }(); - [[maybe_unused]] auto res = zipperJoinWithUndef( - std::ranges::subrange{subrangeLeft.begin(), currentElItL}, - std::ranges::subrange{subrangeRight.begin(), currentElItR}, lessThan_, - addRowIndex, noop, noop, addNotFoundRowIndex); + // All undefined values should already be processed at this point. + AD_CORRECTNESS_CHECK(!isUndefined_(subrangeLeft.front())); + AD_CORRECTNESS_CHECK(!isUndefined_(subrangeRight.front())); + // If we have undefined values stored, we need to provide a generator that + // yields iterators to the individual undefined values. + if constexpr (potentiallyHasUndef) { + [[maybe_unused]] auto res = zipperJoinWithUndef( + std::ranges::subrange{subrangeLeft.begin(), currentElItL}, + std::ranges::subrange{subrangeRight.begin(), currentElItR}, lessThan_, + addRowIndex, + findUndefValues(fullBlockLeft, fullBlockRight, begL, begR), + findUndefValues(fullBlockLeft, fullBlockRight, begL, begR), + addNotFoundRowIndex); + } else { + [[maybe_unused]] auto res = zipperJoinWithUndef( + std::ranges::subrange{subrangeLeft.begin(), currentElItL}, + std::ranges::subrange{subrangeRight.begin(), currentElItR}, lessThan_, + addRowIndex, noop, noop, addNotFoundRowIndex); + } compatibleRowAction_.flush(); // Remove the joined elements. @@ -979,6 +1083,21 @@ struct BlockZipperJoinImpl { return fillEqualToCurrentElBothSides(getCurrentEl()); } + // Based on `blockStatus` add the Cartesian product of the blocks in + // `leftBlocks` and/or `rightBlocks` with their respective counterpart in + // `undefBlocks_`. + void joinWithUndefBlocks(BlockStatus blockStatus, const auto& leftBlocks, + const auto& rightBlocks) { + if (blockStatus == BlockStatus::allFilled || + blockStatus == BlockStatus::leftMissing) { + addCartesianProduct(leftBlocks, rightSide_.undefBlocks_); + } + if (blockStatus == BlockStatus::allFilled || + blockStatus == BlockStatus::rightMissing) { + addCartesianProduct(leftSide_.undefBlocks_, rightBlocks); + } + } + // Combine the above functionality and perform one round of joining. // Has to be called alternately with `fillBuffer`. template @@ -1010,9 +1129,12 @@ struct BlockZipperJoinImpl { blockStatus = BlockStatus::allFilled; } }; + // We are only guaranteed to have all relevant blocks from one side, so we // also need to pass through the remaining blocks from the other side. while (!equalToCurrentElLeft.empty() && !equalToCurrentElRight.empty()) { + joinWithUndefBlocks(blockStatus, equalToCurrentElLeft, + equalToCurrentElRight); addAll(equalToCurrentElLeft, equalToCurrentElRight); switch (blockStatus) { case BlockStatus::allFilled: @@ -1059,15 +1181,112 @@ struct BlockZipperJoinImpl { compatibleRowAction_.flush(); } + // Consume all remaining blocks from one side and add the Cartesian product of + // those blocks with the undef blocks from the other side. + // `reverse` is used to determine if the left or right side is consumed. + template + void consumeRemainingBlocks(auto& side, const auto& undefBlocks) { + while (side.it_ != side.end_) { + const auto& lBlock = *side.it_; + for (const auto& rBlock : undefBlocks) { + if constexpr (reversed) { + compatibleRowAction_.setInput(rBlock.fullBlock(), lBlock); + } else { + compatibleRowAction_.setInput(lBlock, rBlock.fullBlock()); + } + for (size_t i : ad_utility::integerRange(lBlock.size())) { + for (size_t j : rBlock.getIndexRange()) { + if constexpr (reversed) { + compatibleRowAction_.addRow(j, i); + } else { + compatibleRowAction_.addRow(i, j); + } + } + } + } + ++side.it_; + } + } + + // If one of the sides is exhausted and has no values to match the pairs + // left, we need to pair the remaining values with the undef values we have + // left. + void addRemainingUndefPairs() { + if constexpr (potentiallyHasUndef) { + addCartesianProduct(leftSide_.currentBlocks_, rightSide_.undefBlocks_); + consumeRemainingBlocks(leftSide_, rightSide_.undefBlocks_); + + addCartesianProduct(leftSide_.undefBlocks_, rightSide_.currentBlocks_); + consumeRemainingBlocks(rightSide_, leftSide_.undefBlocks_); + + compatibleRowAction_.flush(); + } + } + + // Consume the blocks until the first block is found that does contain a + // defined value. All blocks up until that point are stored in + // `side.undefBlocks_` and skipped for subsequent processing. The first block + // containing defined values is split and the defined part is stored in + // `side.currentBlocks_`. + void findFirstBlockWithoutUndef(auto& side) { + // The reference of `it` is there on purpose. + for (auto& it = side.it_; it != side.end_; ++it) { + auto& el = *it; + if (std::ranges::empty(el) || !isUndefined_(el.front())) { + return; + } + bool endIsUndefined = isUndefined_(el.back()); + side.undefBlocks_.emplace_back(std::move(el)); + if (!endIsUndefined) { + auto& lastUndefinedBlock = side.undefBlocks_.back(); + side.currentBlocks_.push_back(lastUndefinedBlock); + auto subrange = std::ranges::equal_range( + lastUndefinedBlock.subrange(), + lastUndefinedBlock.subrange().front(), lessThan_); + size_t undefCount = std::ranges::size(subrange); + lastUndefinedBlock.setSubrange(std::move(subrange)); + auto& firstDefinedBlock = side.currentBlocks_.back(); + firstDefinedBlock.setSubrange( + firstDefinedBlock.fullBlock().begin() + undefCount, + firstDefinedBlock.fullBlock().end()); + // Make sure this block is not accessed with moved-out value. + ++it; + return; + } + } + } + + // Find and process all leading undefined values from the blocks. + void fetchAndProcessUndefinedBlocks() { + if constexpr (potentiallyHasUndef) { + findFirstBlockWithoutUndef(leftSide_); + findFirstBlockWithoutUndef(rightSide_); + addCartesianProduct(leftSide_.undefBlocks_, rightSide_.undefBlocks_); + } + } + // The actual join routine that combines all the previous functions. template void runJoin() { + fetchAndProcessUndefinedBlocks(); + if (potentiallyHasUndef && !hasUndef(leftSide_) && !hasUndef(rightSide_)) { + // Run the join without UNDEF values if there are none. No need to move + // since LeftSide and RightSide are references. + BlockZipperJoinImpl{leftSide_, rightSide_, lessThan_, + compatibleRowAction_, AlwaysFalse{}} + .template runJoin(); + return; + } while (true) { BlockStatus blockStatus = fillBuffer(); if (leftSide_.currentBlocks_.empty() || rightSide_.currentBlocks_.empty()) { + addRemainingUndefPairs(); if constexpr (DoOptionalJoin) { - fillWithAllFromLeft(); + if (!hasUndef(rightSide_)) { + fillWithAllFromLeft(); + } } return; } @@ -1081,6 +1300,10 @@ template BlockZipperJoinImpl(LHS&, RHS&, const LessThan&, CompatibleRowAction&) -> BlockZipperJoinImpl; +template +BlockZipperJoinImpl(LHS&, RHS&, const LessThan&, CompatibleRowAction&, IsUndef) + -> BlockZipperJoinImpl; } // namespace detail @@ -1133,4 +1356,28 @@ void zipperJoinForBlocksWithoutUndef(LeftBlocks&& leftBlocks, impl.template runJoin(); } +// Similar to `zipperJoinForBlocksWithoutUndef`, but allows for UNDEF values in +// a single column join scenario. +template +void zipperJoinForBlocksWithPotentialUndef(LeftBlocks&& leftBlocks, + RightBlocks&& rightBlocks, + const LessThan& lessThan, + auto& compatibleRowAction, + LeftProjection leftProjection = {}, + RightProjection rightProjection = {}, + DoOptionalJoinTag = {}) { + static constexpr bool DoOptionalJoin = DoOptionalJoinTag::value; + + auto leftSide = detail::makeJoinSide(leftBlocks, leftProjection); + auto rightSide = detail::makeJoinSide(rightBlocks, rightProjection); + + detail::BlockZipperJoinImpl impl{ + leftSide, rightSide, lessThan, compatibleRowAction, + [](const Id& id) { return id.isUndefined(); }}; + impl.template runJoin(); +} + } // namespace ad_utility diff --git a/src/util/TransparentFunctors.h b/src/util/TransparentFunctors.h index e00d3b7a53..1b889a0796 100644 --- a/src/util/TransparentFunctors.h +++ b/src/util/TransparentFunctors.h @@ -1,25 +1,23 @@ -// Copyright 2022, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: -// 2022 - Johannes Kalmbach +// Copyright 2022 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach -#ifndef QLEVER_TRANSPARENTFUNCTORS_H -#define QLEVER_TRANSPARENTFUNCTORS_H +#pragma once #include #include #include -/// Contains several function object types with templated operator() that wrap -/// overloaded functions from the standard library. This enables passing them as -/// function parameters. - -/// Note that in theory all of them could be implemented shorter as captureless -/// lambda expressions. We have chosen not to do this because the STL also does -/// not choose this approach (see e.g. `std::less`, `std::plus`, etc.) and -/// because global inline lambdas in header files might in theory cause ODR (one -/// definition rule) problems, especially when using different compilers. +// Contains several function object types with templated operator() that wrap +// overloaded functions from the standard library. This enables passing them as +// function parameters. +// +// NOTE: in theory all of them could be implemented shorter as captureless +// lambda expressions. We have chosen not to do this because the STL also does +// not choose this approach (see e.g. `std::less`, `std::plus`, etc.) and +// because global inline lambdas in header files might in theory cause ODR (one +// definition rule) problems, especially when using different compilers. namespace ad_utility { @@ -79,37 +77,60 @@ struct ToBoolImpl { } }; +// Implementation of `staticCast` (see below). +template +struct StaticCastImpl { + constexpr decltype(auto) operator()(auto&& x) const { + return static_cast(AD_FWD(x)); + } +}; + +// Implementation of `dereference` (see below). +struct DereferenceImpl { + constexpr decltype(auto) operator()(auto&& x) const { return *AD_FWD(x); } +}; + } // namespace detail -/// Return the first element via perfect forwarding of any type for which -/// `std::get<0>(x)` is valid. This holds e.g. for `std::pair`, `std::tuple`, -/// and `std::array`. +// Return the first element via perfect forwarding of any type for which +// `std::get<0>(x)` is valid. This holds e.g. for `std::pair`, `std::tuple`, +// and `std::array`. static constexpr detail::FirstImpl first; -/// Return the second element via perfect forwarding of any type for which -/// `std::get<1>(x)` is valid. This holds e.g. for `std::pair`, `std::tuple`, -/// and `std::array`. +// Return the second element via perfect forwarding of any type for which +// `std::get<1>(x)` is valid. This holds e.g. for `std::pair`, `std::tuple`, +// and `std::array`. static constexpr detail::SecondImpl second; -/// Transparent functor for `std::holds_alternative` +// Transparent functor for `std::holds_alternative` template static constexpr detail::HoldsAlternativeImpl holdsAlternative; -/// Transparent functor for `std::get`. Currently only works for `std::variant` -/// and not for `std::array` or `std::tuple`. +// Transparent functor for `std::get`. Currently only works for `std::variant` +// and not for `std::array` or `std::tuple`. template static constexpr detail::GetImpl get; -/// Transparent functor for `std::get_if`. As an extension to `std::get_if`, -/// `ad_utility::getIf` may also be called with a `variant` object or reference, -/// not only with a pointer. +// Transparent functor for `std::get_if`. As an extension to `std::get_if`, +// `ad_utility::getIf` may also be called with a `variant` object or reference, +// not only with a pointer. template static constexpr detail::GetIfImpl getIf; +// Transparent functor that converts any type to `bool` via +// `static_cast`. static constexpr detail::ToBoolImpl toBool; -/// A functor that takes an arbitrary number of arguments by reference and does -/// nothing. +// Transparent functor that casts any type to `T` via `static_cast`. +template +static constexpr detail::StaticCastImpl staticCast{}; + +// Transparent functor that dereferences a pointer or smart pointer. +static constexpr detail::DereferenceImpl dereference; + +// Transparent functor that takes an arbitrary number of arguments by reference +// and does nothing. We also use the type `Noop`, hence it is defined here and +// not in the `detail` namespace above. struct Noop { void operator()(const auto&...) const { // This function deliberately does nothing (static analysis expects a @@ -119,5 +140,3 @@ struct Noop { [[maybe_unused]] static constexpr Noop noop{}; } // namespace ad_utility - -#endif // QLEVER_TRANSPARENTFUNCTORS_H diff --git a/test/BlankNodeManagerTest.cpp b/test/BlankNodeManagerTest.cpp index 70803bd3f0..9d1969c80c 100644 --- a/test/BlankNodeManagerTest.cpp +++ b/test/BlankNodeManagerTest.cpp @@ -37,24 +37,32 @@ TEST(BlankNodeManager, blockAllocationAndFree) { // _____________________________________________________________________________ TEST(BlankNodeManager, LocalBlankNodeManagerGetID) { BlankNodeManager bnm(0); - BlankNodeManager::LocalBlankNodeManager l(&bnm); + auto l = std::make_shared(&bnm); // initially the LocalBlankNodeManager doesn't have any blocks - EXPECT_EQ(l.blocks_.size(), 0); + EXPECT_EQ(l->blocks_->size(), 0); // A new Block is allocated, if // no blocks are allocated yet - uint64_t id = l.getId(); - EXPECT_EQ(l.blocks_.size(), 1); - EXPECT_TRUE(l.containsBlankNodeIndex(id)); - EXPECT_FALSE(l.containsBlankNodeIndex(id + 1)); - EXPECT_FALSE(l.containsBlankNodeIndex(id - 1)); + uint64_t id = l->getId(); + EXPECT_EQ(l->blocks_->size(), 1); + EXPECT_TRUE(l->containsBlankNodeIndex(id)); + EXPECT_FALSE(l->containsBlankNodeIndex(id + 1)); + EXPECT_FALSE(l->containsBlankNodeIndex(id - 1)); // or the ids of the last block are all used - l.blocks_.back().nextIdx_ = id + BlankNodeManager::blockSize_; - id = l.getId(); - EXPECT_TRUE(l.containsBlankNodeIndex(id)); - EXPECT_EQ(l.blocks_.size(), 2); + l->blocks_->back().nextIdx_ = id + BlankNodeManager::blockSize_; + id = l->getId(); + EXPECT_TRUE(l->containsBlankNodeIndex(id)); + EXPECT_EQ(l->blocks_->size(), 2); + + // The `LocalBlankNodeManager` still works when recursively merged. + std::vector itSelf{l}; + l->mergeWith(itSelf); + + EXPECT_TRUE(l->containsBlankNodeIndex(id)); + EXPECT_TRUE(l->containsBlankNodeIndex(l->getId())); + EXPECT_EQ(l->blocks_, l->otherBlocks_[0]); } // _____________________________________________________________________________ diff --git a/test/JoinAlgorithmsTest.cpp b/test/JoinAlgorithmsTest.cpp index f9397eaf46..1829de1519 100644 --- a/test/JoinAlgorithmsTest.cpp +++ b/test/JoinAlgorithmsTest.cpp @@ -195,3 +195,279 @@ TEST(JoinAlgorithms, JoinWithBlocksMultipleBlocksPerElementBothSides) { // the optional join stays the same. testOptionalJoin(a, b, expectedResult); } + +namespace { + +// Replacement for `Id`, but with an additional tag to distinguish between ids +// with the same value for testing. +struct FakeId { + Id value_; + std::string_view tag_; + + operator Id() const { return value_; } + auto operator==(const FakeId& other) const { + return value_.getBits() == other.value_.getBits() && tag_ == other.tag_; + } + + friend std::ostream& operator<<(std::ostream& os, const FakeId& id) { + return os << "FakeId{" << id.value_ << ", " << id.tag_ << "}"; + } +}; + +// RowAdder implementation that works with FakeIds and allows to tell undefined +// ids apart from each other. +struct RowAdderWithUndef { + const std::vector* left_ = nullptr; + const std::vector* right_ = nullptr; + std::vector> output_{}; + + void setInput(const std::vector& left, + const std::vector& right) { + left_ = &left; + right_ = &right; + } + + void setOnlyLeftInputForOptionalJoin(const std::vector& left) { + left_ = &left; + } + + void addRow(size_t leftIndex, size_t rightIndex) { + auto id1 = (*left_)[leftIndex]; + auto id2 = (*right_)[rightIndex]; + output_.push_back({id1, id2}); + } + + void addOptionalRow(size_t leftIndex) { + auto id = (*left_)[leftIndex]; + output_.push_back({id, FakeId{Id::makeUndefined(), "OPTIONAL"}}); + } + + void flush() const { + // Does nothing, but is required for the interface. + } + + const auto& getOutput() const { return output_; } +}; + +// Join both vectors `a` and `b` and assert that the result is equal to the +// given `expected` result. Joins are performed 2 times, the second time with +// `a` and `b` swapped. +void testDynamicJoinWithUndef(const std::vector>& a, + const std::vector>& b, + std::vector> expected, + source_location l = source_location::current()) { + using namespace std::placeholders; + using namespace std::ranges; + auto trace = generateLocationTrace(l); + auto compare = [](FakeId l, FakeId r) { + return static_cast(l) < static_cast(r); + }; + AD_CONTRACT_CHECK(is_sorted(a | views::join, {}, ad_utility::staticCast)); + AD_CONTRACT_CHECK(is_sorted(b | views::join, {}, ad_utility::staticCast)); + auto validationProjection = [](const std::array& fakeIds) -> Id { + const auto& [x, y] = fakeIds; + return x == Id::makeUndefined() ? y : x; + }; + { + RowAdderWithUndef adder{}; + zipperJoinForBlocksWithPotentialUndef(a, b, compare, adder); + const auto& result = adder.getOutput(); + // The result must be sorted on the first column + EXPECT_TRUE(is_sorted(result, std::less<>{}, validationProjection)); + // The exact order of the elements with the same first column is not + // important and depends on implementation details. We therefore do not + // enforce it here. + EXPECT_THAT(result, ::testing::UnorderedElementsAreArray(expected)); + } + + for (auto& [x, y] : expected) { + std::swap(x, y); + } + + { + RowAdderWithUndef adder{}; + zipperJoinForBlocksWithPotentialUndef(b, a, compare, adder); + const auto& result = adder.getOutput(); + EXPECT_TRUE(is_sorted(result, std::less<>{}, validationProjection)); + EXPECT_THAT(result, ::testing::UnorderedElementsAreArray(expected)); + } +} +using F = FakeId; +auto I = Id::makeFromInt; +} // namespace + +// ________________________________________________________________________________________ +TEST(JoinAlgorithms, JoinWithBlocksWithUndefOnOneSide) { + auto U = Id::makeUndefined(); + std::vector> a{{{U, "a0"}}, + {{I(42), "a1"}, {I(42), "a2"}}, + {{I(42), "a3"}, {I(67), "a4"}}, + {{I(67), "a5"}}, + {{I(67), "a6"}}, + {{I(67), "a7"}}, + {{I(67), "a8"}}, + {{I(67), "a9"}}, + {{I(67), "a10"}}, + {{I(68), "a11"}}, + {{I(68), "a12"}}, + {{I(68), "a13"}}, + {{I(68), "a14"}}, + {{I(68), "a15"}}, + {{I(68), "a16"}}, + {{I(68), "a17"}}}; + std::vector> b{{{I(2), "b0"}, {I(42), "b1"}}, + {{I(42), "b2"}, {I(67), "b3"}}}; + std::vector> expectedResult{ + {F{U, "a0"}, F{I(2), "b0"}}, {F{U, "a0"}, F{I(42), "b1"}}, + {F{U, "a0"}, F{I(42), "b2"}}, {F{I(42), "a1"}, F{I(42), "b1"}}, + {F{I(42), "a1"}, F{I(42), "b2"}}, {F{I(42), "a2"}, F{I(42), "b1"}}, + {F{I(42), "a2"}, F{I(42), "b2"}}, {F{I(42), "a3"}, F{I(42), "b1"}}, + {F{I(42), "a3"}, F{I(42), "b2"}}, {F{U, "a0"}, F{I(67), "b3"}}, + {F{I(67), "a4"}, F{I(67), "b3"}}, {F{I(67), "a5"}, F{I(67), "b3"}}, + {F{I(67), "a6"}, F{I(67), "b3"}}, {F{I(67), "a7"}, F{I(67), "b3"}}, + {F{I(67), "a8"}, F{I(67), "b3"}}, {F{I(67), "a9"}, F{I(67), "b3"}}, + {F{I(67), "a10"}, F{I(67), "b3"}}, + }; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// ________________________________________________________________________________________ +TEST(JoinAlgorithms, JoinWithBlocksWithUndefOnBothSides) { + auto U = Id::makeUndefined(); + std::vector> a{{{U, "a0"}}, + {{I(42), "a1"}, {I(42), "a2"}}, + {{I(42), "a3"}, {I(67), "a4"}}, + {{I(67), "a5"}}, + {{I(67), "a6"}}, + {{I(67), "a7"}}, + {{I(67), "a8"}}, + {{I(68), "a9"}}, + {{I(68), "a10"}}, + {{I(68), "a11"}}, + {{I(68), "a12"}}}; + std::vector> b{{{U, "b0"}}, + {{U, "b1"}, {I(2), "b2"}, {I(42), "b3"}}, + {{I(42), "b4"}, {I(67), "b5"}}}; + std::vector> expectedResult{ + {F{U, "a0"}, F{U, "b0"}}, {F{U, "a0"}, F{U, "b1"}}, + {F{U, "a0"}, F{I(2), "b2"}}, {F{U, "a0"}, F{I(42), "b3"}}, + {F{U, "a0"}, F{I(42), "b4"}}, {F{I(42), "a1"}, F{U, "b0"}}, + {F{I(42), "a1"}, F{U, "b1"}}, {F{I(42), "a2"}, F{U, "b0"}}, + {F{I(42), "a2"}, F{U, "b1"}}, {F{I(42), "a3"}, F{U, "b0"}}, + {F{I(42), "a3"}, F{U, "b1"}}, {F{I(42), "a1"}, F{I(42), "b3"}}, + {F{I(42), "a2"}, F{I(42), "b3"}}, {F{I(42), "a3"}, F{I(42), "b3"}}, + {F{I(42), "a1"}, F{I(42), "b4"}}, {F{I(42), "a2"}, F{I(42), "b4"}}, + {F{I(42), "a3"}, F{I(42), "b4"}}, {F{U, "a0"}, F{I(67), "b5"}}, + {F{I(67), "a4"}, F{U, "b0"}}, {F{I(67), "a4"}, F{U, "b1"}}, + {F{I(67), "a5"}, F{U, "b0"}}, {F{I(67), "a5"}, F{U, "b1"}}, + {F{I(67), "a6"}, F{U, "b0"}}, {F{I(67), "a6"}, F{U, "b1"}}, + {F{I(67), "a7"}, F{U, "b0"}}, {F{I(67), "a7"}, F{U, "b1"}}, + {F{I(67), "a8"}, F{U, "b0"}}, {F{I(67), "a8"}, F{U, "b1"}}, + {F{I(67), "a4"}, F{I(67), "b5"}}, {F{I(67), "a5"}, F{I(67), "b5"}}, + {F{I(67), "a6"}, F{I(67), "b5"}}, {F{I(67), "a7"}, F{I(67), "b5"}}, + {F{I(67), "a8"}, F{I(67), "b5"}}, {F{I(68), "a9"}, F{U, "b0"}}, + {F{I(68), "a9"}, F{U, "b1"}}, {F{I(68), "a10"}, F{U, "b0"}}, + {F{I(68), "a10"}, F{U, "b1"}}, {F{I(68), "a11"}, F{U, "b0"}}, + {F{I(68), "a11"}, F{U, "b1"}}, {F{I(68), "a12"}, F{U, "b0"}}, + {F{I(68), "a12"}, F{U, "b1"}}, + }; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// _____________________________________________________________________________ +TEST(JoinAlgorithms, JoinWithBlocksOneSideSingleUndef) { + auto U = Id::makeUndefined(); + std::vector> a{{{U, "a0"}}}; + std::vector> b{{{I(1), "b0"}, {I(2), "b1"}}}; + + std::vector> expectedResult{ + {F{U, "a0"}, F{I(1), "b0"}}, + {F{U, "a0"}, F{I(2), "b1"}}, + }; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// _____________________________________________________________________________ +TEST(JoinAlgorithms, JoinWithBlocksOneUndefinedValueMixedWithOtherValues) { + auto U = Id::makeUndefined(); + std::vector> a{{{U, "a0"}, {I(1), "a1"}, {I(2), "a2"}}}; + std::vector> b{{{U, "b0"}, {I(2), "b1"}, {I(3), "b2"}}}; + + std::vector> expectedResult{ + {F{U, "a0"}, F{U, "b0"}}, {F{I(1), "a1"}, F{U, "b0"}}, + {F{U, "a0"}, F{I(2), "b1"}}, {F{I(2), "a2"}, F{U, "b0"}}, + {F{I(2), "a2"}, F{I(2), "b1"}}, {F{U, "a0"}, F{I(3), "b2"}}, + }; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// _____________________________________________________________________________ +TEST(JoinAlgorithms, UndefinedJoinWorksWithoutUndefinedValues) { + std::vector> a{{{I(1), "a1"}, {I(2), "a2"}}}; + std::vector> b{{{I(2), "b1"}, {I(3), "b2"}}}; + + std::vector> expectedResult{ + {F{I(2), "a2"}, F{I(2), "b1"}}}; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// _____________________________________________________________________________ +TEST(JoinAlgorithms, JoinWithBlocksMultipleGroupsAfterUndefined) { + auto U = Id::makeUndefined(); + std::vector> a{ + {{U, "a0"}, {I(1), "a1"}, {I(2), "a2"}, {I(3), "a3"}}}; + std::vector> b{ + {{U, "b0"}, {I(1), "b1"}, {I(2), "b2"}, {I(3), "b3"}}}; + + std::vector> expectedResult{ + {F{U, "a0"}, F{U, "b0"}}, {F{I(1), "a1"}, F{U, "b0"}}, + {F{U, "a0"}, F{I(1), "b1"}}, {F{I(1), "a1"}, F{I(1), "b1"}}, + {F{U, "a0"}, F{I(2), "b2"}}, {F{I(2), "a2"}, F{U, "b0"}}, + {F{I(2), "a2"}, F{I(2), "b2"}}, {F{U, "a0"}, F{I(3), "b3"}}, + {F{I(3), "a3"}, F{U, "b0"}}, {F{I(3), "a3"}, F{I(3), "b3"}}, + }; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// _____________________________________________________________________________ +TEST(JoinAlgorithms, TrailingEmptyBlocksAreHandledWell) { + auto U = Id::makeUndefined(); + std::vector> a{ + {{U, "a0"}}, {{I(1), "a1"}}, {{I(2), "a2"}}, {{I(3), "a3"}}}; + std::vector> b{{{I(3), "b0"}}, {}, {}, {}, {}}; + + std::vector> expectedResult{ + {F{U, "a0"}, F{I(3), "b0"}}, {F{I(3), "a3"}, F{I(3), "b0"}}}; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// _____________________________________________________________________________ +TEST(JoinAlgorithms, EmptyBlocksInTheMiddleAreHandledWell) { + auto U = Id::makeUndefined(); + std::vector> a{ + {{U, "a0"}}, {{I(1), "a1"}}, {{I(2), "a2"}}, {{I(3), "a3"}}}; + std::vector> b{{{I(1), "b0"}}, {}, {{I(1), "b1"}}, {}, {}, + {{I(3), "b2"}}}; + + std::vector> expectedResult{ + {F{U, "a0"}, F{I(1), "b0"}}, {F{U, "a0"}, F{I(1), "b1"}}, + {F{I(1), "a1"}, F{I(1), "b0"}}, {F{I(1), "a1"}, F{I(1), "b1"}}, + {F{U, "a0"}, F{I(3), "b2"}}, {F{I(3), "a3"}, F{I(3), "b2"}}}; + testDynamicJoinWithUndef(a, b, expectedResult); +} + +// _____________________________________________________________________________ +TEST(JoinAlgorithm, DefaultIsUndefinedFunctionAlwaysReturnsFalse) { + // This test is mostly for coverage purposes. + RowAdderWithUndef adder{}; + std::vector> dummyBlocks{}; + auto compare = [](auto l, auto r) { return static_cast(l) < r; }; + auto joinSide = + ad_utility::detail::makeJoinSide(dummyBlocks, std::identity{}); + ad_utility::detail::BlockZipperJoinImpl impl{joinSide, joinSide, compare, + adder}; + EXPECT_FALSE(impl.isUndefined_("Something")); + EXPECT_FALSE(impl.isUndefined_(1)); + EXPECT_FALSE(impl.isUndefined_(I(1))); + EXPECT_FALSE(impl.isUndefined_(Id::makeUndefined())); +} diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index a9058d3a68..38728a3c92 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -134,6 +134,14 @@ TEST(LocalVocab, clone) { for (size_t i = 0; i < inputWords.size(); ++i) { EXPECT_EQ(*indices[i], inputWords[i]); } + + // Test that a BlankNodeIndex obtained by a `LocalVocab` is also contained + // in the clone. + ad_utility::BlankNodeManager bnm; + LocalVocab v; + auto id = v.getBlankNodeIndex(&bnm); + LocalVocab vClone = v.clone(); + EXPECT_TRUE(vClone.isBlankNodeIndexContained(id)); } // _____________________________________________________________________________ TEST(LocalVocab, merge) { @@ -162,6 +170,30 @@ TEST(LocalVocab, merge) { EXPECT_EQ(*indices[1], lit("twoA")); EXPECT_EQ(*indices[2], lit("oneB")); EXPECT_EQ(*indices[3], lit("twoB")); + + // Test that the `LocalBlankNodeManager` of vocabs is merged correctly. + ad_utility::BlankNodeManager bnm; + LocalVocab localVocabMerged2; + BlankNodeIndex id; + { + LocalVocab vocC, vocD; + id = vocC.getBlankNodeIndex(&bnm); + auto vocabs2 = std::vector{&std::as_const(vocC), &std::as_const(vocD)}; + localVocabMerged2 = LocalVocab::merge(vocabs2); + } + EXPECT_TRUE(localVocabMerged2.isBlankNodeIndexContained(id)); + + LocalVocab vocE, vocF; + auto id2 = vocE.getBlankNodeIndex(&bnm); + auto vocabs3 = + std::vector{&std::as_const(localVocabMerged2), &std::as_const(vocF)}; + vocE.mergeWith(vocabs3 | std::views::transform( + [](const LocalVocab* l) -> const LocalVocab& { + return *l; + })); + EXPECT_TRUE(vocE.isBlankNodeIndexContained(id)); + EXPECT_TRUE(localVocabMerged2.isBlankNodeIndexContained(id)); + EXPECT_TRUE(vocE.isBlankNodeIndexContained(id2)); } // _____________________________________________________________________________