Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deduplicate the "other sets" in a LocalVocab #1632

Merged
merged 10 commits into from
Dec 4, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions src/engine/LocalVocab.cpp
Original file line number Diff line number Diff line change
@@ -5,8 +5,6 @@

#include "engine/LocalVocab.h"

#include "absl/strings/str_cat.h"
#include "global/Id.h"
#include "global/ValueId.h"
#include "util/TransparentFunctors.h"

23 changes: 15 additions & 8 deletions src/engine/LocalVocab.h
Original file line number Diff line number Diff line change
@@ -5,6 +5,9 @@

#pragma once

#include <absl/container/flat_hash_set.h>
#include <absl/container/node_hash_set.h>

#include <algorithm>
#include <cstdlib>
#include <memory>
@@ -13,9 +16,7 @@
#include <string>
#include <vector>

#include "absl/container/node_hash_set.h"
#include "global/Id.h"
#include "parser/LiteralOrIri.h"
#include "index/LocalVocabEntry.h"
#include "util/BlankNodeManager.h"
#include "util/Exception.h"

@@ -43,7 +44,7 @@ class LocalVocab {
std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>();

// The other sets of `LocalVocabEntry`s, which are static.
std::vector<std::shared_ptr<const Set>> otherWordSets_;
absl::flat_hash_set<std::shared_ptr<const Set>> otherWordSets_;

// The number of words (so that we can compute `size()` in constant time).
size_t size_ = 0;
@@ -114,12 +115,18 @@ class LocalVocab {
// primary set of this `LocalVocab` remains unchanged.
template <std::ranges::range R>
void mergeWith(const R& vocabs) {
auto inserter = std::back_inserter(otherWordSets_);
using std::views::filter;
auto addWordSet = [this](const std::shared_ptr<const Set>& set) {
bool added = otherWordSets_.insert(set).second;
size_ += static_cast<size_t>(added) * set->size();
};
// Note: Even though the `otherWordsSet_`is a hash set that filters out
// duplicates, we still manually filter out empty sets, because these
// typically don't compare equal to each other because of the`shared_ptr`
// semantics.
for (const auto& vocab : vocabs | filter(std::not_fn(&LocalVocab::empty))) {
std::ranges::copy(vocab.otherWordSets_, inserter);
*inserter = vocab.primaryWordSet_;
size_ += vocab.size_;
std::ranges::for_each(vocab.otherWordSets_, addWordSet);
addWordSet(vocab.primaryWordSet_);
}

// Also merge the `vocabs` `LocalBlankNodeManager`s, if they exist.
48 changes: 43 additions & 5 deletions src/engine/Operation.cpp
Original file line number Diff line number Diff line change
@@ -4,13 +4,41 @@

#include "engine/Operation.h"

#include <absl/cleanup/cleanup.h>

#include "engine/QueryExecutionTree.h"
#include "global/RuntimeParameters.h"
#include "util/OnDestructionDontThrowDuringStackUnwinding.h"
#include "util/TransparentFunctors.h"

using namespace std::chrono_literals;

namespace {
// Keep track of some statistics about the local vocabs of the results.
struct LocalVocabTracking {
size_t maxSize_ = 0;
size_t sizeSum_ = 0;
size_t totalVocabs_ = 0;
size_t nonEmptyVocabs_ = 0;

float avgSize() const {
return nonEmptyVocabs_ == 0 ? 0
: static_cast<float>(sizeSum_) /
static_cast<float>(nonEmptyVocabs_);
}
};

// Merge the stats of a single local vocab into the overall stats.
void mergeStats(LocalVocabTracking& stats, const LocalVocab& vocab) {
stats.maxSize_ = std::max(stats.maxSize_, vocab.size());
stats.sizeSum_ += vocab.size();
++stats.totalVocabs_;
if (!vocab.empty()) {
++stats.nonEmptyVocabs_;
}
}
} // namespace

//______________________________________________________________________________
template <typename F>
void Operation::forAllDescendants(F f) {
@@ -132,23 +160,33 @@ ProtoResult Operation::runComputation(const ad_utility::Timer& timer,
// correctly because the result was computed, so we can pass `nullopt` as
// the last argument.
if (result.isFullyMaterialized()) {
size_t numLocalVocabs = result.localVocab().numSets();
if (numLocalVocabs > 1) {
runtimeInfo().addDetail("num-local-vocabs", numLocalVocabs);
size_t vocabSize = result.localVocab().size();
if (vocabSize > 1) {
runtimeInfo().addDetail("local-vocab-size", vocabSize);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We also want the number of non-local-vocabs that are merged together in the materialized local vocab here (if it is greater than >1), solocal-vocab-num-sets` would be a nice name here.

}
updateRuntimeInformationOnSuccess(result.idTable().size(),
ad_utility::CacheStatus::computed,
timer.msecs(), std::nullopt);
} else {
runtimeInfo().status_ = RuntimeInformation::lazilyMaterialized;
result.runOnNewChunkComputed(
[this, timeSizeUpdate = 0us](
const IdTable& idTable,
[this, timeSizeUpdate = 0us, vocabStats = LocalVocabTracking{}](
const Result::IdTableVocabPair& pair,
std::chrono::microseconds duration) mutable {
const IdTable& idTable = pair.idTable_;
updateRuntimeStats(false, idTable.numRows(), idTable.numColumns(),
duration);
LOG(DEBUG) << "Computed partial chunk of size " << idTable.numRows()
<< " x " << idTable.numColumns() << std::endl;
mergeStats(vocabStats, pair.localVocab_);
if (vocabStats.sizeSum_ > 0) {
runtimeInfo().addDetail(
"non-empty-local-vocabs",
absl::StrCat(vocabStats.nonEmptyVocabs_, " / ",
vocabStats.totalVocabs_,
", Ø = ", vocabStats.avgSize(),
", max = ", vocabStats.maxSize_));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also the number of contained sets (similar statistics, max/avg) would be good to know here.

}
timeSizeUpdate += duration;
if (timeSizeUpdate > 50ms) {
timeSizeUpdate = 0us;
5 changes: 3 additions & 2 deletions src/engine/Result.cpp
Original file line number Diff line number Diff line change
@@ -208,7 +208,8 @@ void Result::checkDefinedness(const VariableToColumnMap& varColMap) {

// _____________________________________________________________________________
void Result::runOnNewChunkComputed(
std::function<void(const IdTable&, std::chrono::microseconds)> onNewChunk,
std::function<void(const IdTableVocabPair&, std::chrono::microseconds)>
onNewChunk,
std::function<void(bool)> onGeneratorFinished) {
AD_CONTRACT_CHECK(!isFullyMaterialized());
auto generator = [](Generator original, auto onNewChunk,
@@ -220,7 +221,7 @@ void Result::runOnNewChunkComputed(
try {
ad_utility::timer::Timer timer{ad_utility::timer::Timer::Started};
for (IdTableVocabPair& pair : original) {
onNewChunk(pair.idTable_, timer.value());
onNewChunk(pair, timer.value());
co_yield pair;
timer.start();
}
9 changes: 5 additions & 4 deletions src/engine/Result.h
Original file line number Diff line number Diff line change
@@ -119,9 +119,9 @@ class Result {
Result& operator=(Result&& other) = default;

// Wrap the generator stored in `data_` within a new generator that calls
// `onNewChunk` every time a new `IdTable` is yielded by the original
// generator and passed this new `IdTable` along with microsecond precision
// timing information on how long it took to compute this new chunk.
// `onNewChunk` every time a new `IdTableVocabPair` is yielded by the original
// generator and passed this new `IdTableVocabPair` along with microsecond
// precision timing information on how long it took to compute this new chunk.
// `onGeneratorFinished` is guaranteed to be called eventually as long as the
// generator is consumed at least partially, with `true` if an exception
// occurred during consumption or with `false` when the generator is done
@@ -130,7 +130,8 @@ class Result {
// Throw an `ad_utility::Exception` if the underlying `data_` member holds the
// wrong variant.
void runOnNewChunkComputed(
std::function<void(const IdTable&, std::chrono::microseconds)> onNewChunk,
std::function<void(const IdTableVocabPair&, std::chrono::microseconds)>
onNewChunk,
std::function<void(bool)> onGeneratorFinished);

// Wrap the generator stored in `data_` within a new generator that aggregates
3 changes: 0 additions & 3 deletions src/parser/LiteralOrIri.h
Original file line number Diff line number Diff line change
@@ -4,13 +4,10 @@

#pragma once

#include <absl/strings/str_cat.h>

#include <variant>

#include "parser/Iri.h"
#include "parser/Literal.h"
#include "util/Exception.h"

namespace ad_utility::triple_component {
static constexpr char literalPrefixChar = '"';
42 changes: 42 additions & 0 deletions test/LocalVocabTest.cpp
Original file line number Diff line number Diff line change
@@ -410,3 +410,45 @@ TEST(LocalVocab, getBlankNodeIndex) {
BlankNodeIndex b = v.getBlankNodeIndex(&bnm);
EXPECT_NE(a, b);
}

// _____________________________________________________________________________
TEST(LocalVocab, otherWordSetIsTransitivelyPropagated) {
using ad_utility::triple_component::LiteralOrIri;
LocalVocab original;
original.getIndexAndAddIfNotContained(
LocalVocabEntry{LiteralOrIri::literalWithoutQuotes("test")});

LocalVocab clone = original.clone();
LocalVocab mergeCandidate;
mergeCandidate.mergeWith(std::span{&clone, 1});

EXPECT_EQ(mergeCandidate.size(), 1);
EXPECT_THAT(mergeCandidate.getAllWordsForTesting(),
::testing::UnorderedElementsAre(
LiteralOrIri::literalWithoutQuotes("test")));
}

// _____________________________________________________________________________
TEST(LocalVocab, sizeIsProperlyUpdatedOnMerge) {
using ad_utility::triple_component::LiteralOrIri;
using ::testing::UnorderedElementsAre;
LocalVocab original;
original.getIndexAndAddIfNotContained(
LocalVocabEntry{LiteralOrIri::literalWithoutQuotes("test")});

LocalVocab clone1 = original.clone();
LocalVocab clone2 = original.clone();
clone2.mergeWith(std::span{&original, 1});
original.mergeWith(std::span{&clone1, 1});

// Implementation detail, merging does add to the "other word set" but does
// not deduplicate with the primary word set.
EXPECT_EQ(original.size(), 2);
EXPECT_THAT(original.getAllWordsForTesting(),
UnorderedElementsAre(LiteralOrIri::literalWithoutQuotes("test"),
LiteralOrIri::literalWithoutQuotes("test")));

EXPECT_EQ(clone2.size(), 1);
EXPECT_THAT(clone2.getAllWordsForTesting(),
UnorderedElementsAre(LiteralOrIri::literalWithoutQuotes("test")));
}
14 changes: 13 additions & 1 deletion test/OperationTest.cpp
Original file line number Diff line number Diff line change
@@ -351,8 +351,12 @@ TEST(Operation, verifyRuntimeInformationIsUpdatedForLazyOperations) {
std::vector<IdTable> idTablesVector{};
idTablesVector.push_back(makeIdTableFromVector({{3, 4}}));
idTablesVector.push_back(makeIdTableFromVector({{7, 8}}));
LocalVocab localVocab{};
localVocab.getIndexAndAddIfNotContained(LocalVocabEntry{
ad_utility::triple_component::Literal::literalWithoutQuotes("Test")});
ValuesForTesting valuesForTesting{
qec, std::move(idTablesVector), {Variable{"?x"}, Variable{"?y"}}};
qec, std::move(idTablesVector), {Variable{"?x"}, Variable{"?y"}},
false, std::vector<ColumnIndex>{}, std::move(localVocab)};

ad_utility::Timer timer{ad_utility::Timer::InitialStatus::Started};
EXPECT_THROW(
@@ -374,14 +378,22 @@ TEST(Operation, verifyRuntimeInformationIsUpdatedForLazyOperations) {
{[&]() {
EXPECT_EQ(rti.status_, Status::lazilyMaterialized);
expectRtiHasDimensions(rti, 2, 1);
ASSERT_TRUE(rti.details_.contains("non-empty-local-vocabs"));
EXPECT_EQ(rti.details_["non-empty-local-vocabs"],
"1 / 1, Ø = 1, max = 1");
},
[&]() {
EXPECT_EQ(rti.status_, Status::lazilyMaterialized);
expectRtiHasDimensions(rti, 2, 2);
ASSERT_TRUE(rti.details_.contains("non-empty-local-vocabs"));
EXPECT_EQ(rti.details_["non-empty-local-vocabs"],
"2 / 2, Ø = 1, max = 1");
}});

EXPECT_EQ(rti.status_, Status::lazilyMaterialized);
expectRtiHasDimensions(rti, 2, 2);
ASSERT_TRUE(rti.details_.contains("non-empty-local-vocabs"));
EXPECT_EQ(rti.details_["non-empty-local-vocabs"], "2 / 2, Ø = 1, max = 1");
}

// _____________________________________________________________________________
Loading
Loading