Skip to content

Commit

Permalink
Parallel parsing, fix logging and deprecation warnings (#1620)
Browse files Browse the repository at this point in the history
#1537 introduced various pecularities in the logging regarding parallel parsing. In particular, for a single input stream, a deprecation warning is shown even when `--parse-parallel` or `-p` are explicity specified on the command line, which is the recommended behavior. This code is now refactored, with a more uniform and informative logging, and a deprecation warning is only shown when it should be.
  • Loading branch information
hannahbast authored Nov 16, 2024
1 parent 26a3d42 commit 39ca684
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 54 deletions.
4 changes: 3 additions & 1 deletion src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,14 @@ int main(int argc, char** argv) {
}

bool parseInParallel = getParameterValue(i, parseParallel, false);
bool parseInParallelSetExplicitly = i < parseParallel.size();
auto& filename = inputFile.at(i);
if (filename == "-") {
filename = "/dev/stdin";
}
fileSpecs.emplace_back(filename, getFiletype(type, filename),
std::move(defaultGraph), parseInParallel);
std::move(defaultGraph), parseInParallel,
parseInParallelSetExplicitly);
}
return fileSpecs;
};
Expand Down
51 changes: 29 additions & 22 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -291,36 +291,43 @@ std::pair<size_t, size_t> IndexImpl::createInternalPSOandPOS(
void IndexImpl::updateInputFileSpecificationsAndLog(
std::vector<Index::InputFileSpecification>& spec,
std::optional<bool> parallelParsingSpecifiedViaJson) {
if (spec.size() == 1) {
LOG(INFO) << "Processing triples from " << spec.at(0).filename_ << " ..."
<< std::endl;
} else {
LOG(INFO) << "Processing triples from " << spec.size()
<< " input streams ..." << std::endl;
}
if (parallelParsingSpecifiedViaJson.value_or(false) == true) {
std::string pleaseUseParallelParsingOption =
"please use the command-line option --parse-parallel or -p";
// Parallel parsing specified in the `settings.json` file. This is deprecated
// for a single input stream and forbidden for multiple input streams.
if (parallelParsingSpecifiedViaJson.has_value()) {
if (spec.size() == 1) {
LOG(WARN) << "Parallel parsing set to `true` in the `.settings.json` "
"file; this is deprecated, please use the command-line "
" option --parse-parallel or -p instead"
<< std::endl;
spec.at(0).parseInParallel_ = true;
LOG(WARN) << "Parallel parsing set in the `.settings.json` file; this is "
"deprecated, "
<< pleaseUseParallelParsingOption << std::endl;
spec.at(0).parseInParallel_ = parallelParsingSpecifiedViaJson.value();
} else {
throw std::runtime_error{
"For more than one input file, the parallel parsing must not be "
"specified via the `.settings.json` file, but has to be specified "
" via the command-line option --parse-parallel or -p"};
throw std::runtime_error{absl::StrCat(
"Parallel parsing set in the `.settings.json` file; this is "
"forbidden for multiple input streams, ",
pleaseUseParallelParsingOption)};
}
}

if (spec.size() == 1 && !parallelParsingSpecifiedViaJson.has_value()) {
// For a single input stream, if parallel parsing is not specified explicitly
// on the command line, we set if implicitly for backward compatibility.
if (!parallelParsingSpecifiedViaJson.has_value() && spec.size() == 1 &&
!spec.at(0).parseInParallelSetExplicitly_) {
LOG(WARN) << "Implicitly using the parallel parser for a single input file "
"for reasons of backward compatibility; this is deprecated, "
"please use the command-line option --parse-parallel or -p "
"instead"
<< std::endl;
<< pleaseUseParallelParsingOption << std::endl;
spec.at(0).parseInParallel_ = true;
}
// For a single input stream, show the name and whether we parse in parallel.
// For multiple input streams, only show the number of streams.
if (spec.size() == 1) {
LOG(INFO) << "Parsing triples from single input stream "
<< spec.at(0).filename_ << " (parallel = "
<< (spec.at(0).parseInParallel_ ? "true" : "false") << ") ..."
<< std::endl;
} else {
LOG(INFO) << "Processing triples from " << spec.size()
<< " input streams ..." << std::endl;
}
}

// _____________________________________________________________________________
Expand Down
12 changes: 8 additions & 4 deletions src/index/InputFileSpecification.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach(joka921) <[email protected]>
//
// Copyright 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Johannes Kalmbach <[email protected]>

#pragma once

#include <optional>
Expand All @@ -27,6 +27,10 @@ struct InputFileSpecification {
// Turtle files with all prefixes at the beginning and no multiline literals.
bool parseInParallel_ = false;

// Remember if the value for parallel parsing was set explicitly (via the
// command line).
bool parseInParallelSetExplicitly_ = false;

bool operator==(const InputFileSpecification&) const = default;
};
} // namespace qlever
88 changes: 61 additions & 27 deletions test/IndexTest.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// Copyright 2015, University of Freiburg,
// Copyright 2015 - 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])
// Authors: Björn Buchhold <[email protected]> [2015 - 2017]
// Johannes Kalmbach <[email protected]>
// Hannah Bast <[email protected]>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
Expand Down Expand Up @@ -527,50 +529,82 @@ TEST(IndexTest, trivialGettersAndSetters) {

TEST(IndexTest, updateInputFileSpecificationsAndLog) {
using enum qlever::Filetype;
std::vector<qlever::InputFileSpecification> files{
{"singleFile.ttl", Turtle, std::nullopt, false}};
std::vector<qlever::InputFileSpecification> singleFileSpec = {
{"singleFile.ttl", Turtle, std::nullopt}};
std::vector<qlever::InputFileSpecification> twoFilesSpec = {
{"firstFile.ttl", Turtle, std::nullopt},
{"secondFile.ttl", Turtle, std::nullopt}};
using namespace ::testing;

// Parallel parsing not specified anywhere. For a single input stream, we then
// default to `true` for reasons of backwards compatibility, but this is
// deprecated. For multiple input streams, we default to `false` and this is
// normal behavior.
{
singleFileSpec.at(0).parseInParallelSetExplicitly_ = false;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, false);
IndexImpl::updateInputFileSpecificationsAndLog(singleFileSpec,
std::nullopt);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("singleFile.ttl"), HasSubstr("deprecated")));
EXPECT_TRUE(singleFileSpec.at(0).parseInParallel_);
}
{
twoFilesSpec.at(0).parseInParallelSetExplicitly_ = false;
twoFilesSpec.at(1).parseInParallelSetExplicitly_ = false;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(twoFilesSpec, std::nullopt);
EXPECT_THAT(
testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from singleFile.ttl"), Not(HasSubstr("parallel"))));
EXPECT_FALSE(files.at(0).parseInParallel_);
AllOf(HasSubstr("from 2 input streams"), Not(HasSubstr("deprecated"))));
EXPECT_FALSE(twoFilesSpec.at(0).parseInParallel_);
EXPECT_FALSE(twoFilesSpec.at(1).parseInParallel_);
}

{
// Parallel parsing specified on the command line and not in the
// `settings.json`. This is normal behavior (no deprecation warning).
for (auto parallelParsing : {true, false}) {
singleFileSpec.at(0).parseInParallel_ = parallelParsing;
singleFileSpec.at(0).parseInParallelSetExplicitly_ = true;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, true);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from singleFile.ttl"),
HasSubstr("settings.json"), HasSubstr("deprecated")));
EXPECT_TRUE(files.at(0).parseInParallel_);
IndexImpl::updateInputFileSpecificationsAndLog(singleFileSpec,
std::nullopt);
EXPECT_THAT(
testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("singleFile.ttl"), Not(HasSubstr("deprecated"))));
EXPECT_EQ(singleFileSpec.at(0).parseInParallel_, parallelParsing);
}
{
twoFilesSpec.at(0).parseInParallel_ = true;
twoFilesSpec.at(1).parseInParallel_ = false;
twoFilesSpec.at(0).parseInParallelSetExplicitly_ = true;
twoFilesSpec.at(1).parseInParallelSetExplicitly_ = true;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, std::nullopt);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from singleFile.ttl"),
HasSubstr("single input"), HasSubstr("deprecated")));
EXPECT_TRUE(files.at(0).parseInParallel_);
IndexImpl::updateInputFileSpecificationsAndLog(twoFilesSpec, std::nullopt);
EXPECT_THAT(
testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from 2 input streams"), Not(HasSubstr("deprecated"))));
EXPECT_TRUE(twoFilesSpec.at(0).parseInParallel_);
EXPECT_FALSE(twoFilesSpec.at(1).parseInParallel_);
}

// Parallel parsing not specified on the command line, but explicitly set in
// the `settings.json` file. This is deprecated for a single input
// stream and forbidden for multiple input streams.
{
files.emplace_back("secondFile.ttl", Turtle, std::nullopt, false);
auto filesCopy = files;
singleFileSpec.at(0).parseInParallelSetExplicitly_ = false;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, false);
IndexImpl::updateInputFileSpecificationsAndLog(singleFileSpec, true);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from 2 input streams"),
Not(HasSubstr("is deprecated"))));
EXPECT_EQ(files, filesCopy);
AllOf(HasSubstr("singleFile.ttl"), HasSubstr("deprecated")));
EXPECT_TRUE(singleFileSpec.at(0).parseInParallel_);
}

{
twoFilesSpec.at(0).parseInParallelSetExplicitly_ = false;
twoFilesSpec.at(1).parseInParallelSetExplicitly_ = false;
AD_EXPECT_THROW_WITH_MESSAGE(
IndexImpl::updateInputFileSpecificationsAndLog(files, true),
HasSubstr("but has to be specified"));
IndexImpl::updateInputFileSpecificationsAndLog(twoFilesSpec, true),
AllOf(Not(HasSubstr("from 2 input streams")), HasSubstr("forbidden")));
}
}

Expand Down

0 comments on commit 39ca684

Please sign in to comment.