Skip to content

Commit

Permalink
BASE declarations are now handled correctly (#1660)
Browse files Browse the repository at this point in the history
When a `BASE` IRI is defined, IRIs without a scheme (that is, IRIs that do not look like `<http://...>`) should be resolved against it. For example, with `BASE <http://purl.uniprot.org/uniprot/>`, the relative IRI `<UPI001AF4585D>` should be resolved to `<http://purl.uniprot.org/uniprot/UPI001AF4585D>`, and the absolute IRI `</prosite/PS51927>` should be resolved to `<http://purl.uniprot.org/prosite/PS51927>`.

NOTE: Without `BASE` declaration, relative IRIs like `<a>` are accepted and left unchanged. This is used extensively in our tests. This is not 100% compliant with the SPARQL standard, but we consider being slightly more accepting than the standard harmless in this case.
  • Loading branch information
hannahbast authored Dec 10, 2024
1 parent 01d8306 commit 0400f90
Show file tree
Hide file tree
Showing 8 changed files with 272 additions and 43 deletions.
61 changes: 58 additions & 3 deletions src/parser/Iri.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2023, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Benedikt Maria Beckermann <[email protected]>
// Copyright 2023 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Authors: Benedikt Maria Beckermann <[email protected]>
// Hannah Bast <[email protected]>

#include "parser/Iri.h"

Expand Down Expand Up @@ -47,6 +48,60 @@ Iri Iri::fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix) {
return Iri{prefix, asNormalizedStringViewUnsafe(suffixNormalized)};
}

// ____________________________________________________________________________
Iri Iri::getBaseIri(bool domainOnly) const {
  // The stored IRI must include its enclosing angle brackets.
  AD_CORRECTNESS_CHECK(iri_.starts_with('<') && iri_.ends_with('>'), iri_);
  const std::string_view iriView{iri_};

  // Decide where the search for the first `/` of the path starts: right after
  // the scheme pattern if there is one, otherwise right after the opening `<`
  // (a base IRI without scheme is accepted, but a warning is logged).
  size_t searchStart = 1;
  const size_t schemePos = iri_.find(schemePattern);
  if (schemePos != std::string::npos) {
    searchStart = schemePos + schemePattern.size();
  } else {
    LOG(WARN) << "No scheme found in base IRI: \"" << iri_ << "\""
              << " (but we accept it anyway)" << std::endl;
  }
  const size_t firstSlash = iri_.find('/', searchStart);

  // The character right before the closing `>` tells us whether the IRI
  // already ends with a `/`.
  const bool pathIsEmpty = firstSlash == std::string::npos;
  const bool endsWithSlash = iri_[iri_.size() - 2] == '/';

  // Append a `/` to the full IRI if it has no path at all, or if the full IRI
  // is requested (`domainOnly` is false) and the trailing `/` is missing.
  if (pathIsEmpty || (!domainOnly && !endsWithSlash)) {
    return fromIrirefWithoutBrackets(
        absl::StrCat(iriView.substr(1, iri_.size() - 2), "/"sv));
  }

  // With `domainOnly`, keep everything up to and including the first `/`
  // found above, which drops the path part.
  if (domainOnly) {
    return fromIrirefWithoutBrackets(iriView.substr(1, firstSlash));
  }

  // The IRI already ends with `/` and the full IRI is wanted: nothing to do.
  return *this;
}

// ____________________________________________________________________________
Iri Iri::fromIrirefConsiderBase(std::string_view iriStringWithBrackets,
                                const Iri& basePrefixForRelativeIris,
                                const Iri& basePrefixForAbsoluteIris) {
  // The input must be of the form `<...>` (possibly with empty content).
  auto iriSv = iriStringWithBrackets;
  AD_CORRECTNESS_CHECK(iriSv.size() >= 2);
  AD_CORRECTNESS_CHECK(iriSv[0] == '<' && iriSv[iriSv.size() - 1] == '>');

  // Case 1: IRI with scheme (like `<http://...>`), or the base prefix for
  // absolute IRIs is empty. In both cases, the IRI is taken as is. The scheme
  // is detected with the same `schemePattern` that `getBaseIri` uses.
  if (iriSv.find(schemePattern) != std::string_view::npos ||
      basePrefixForAbsoluteIris.empty()) {
    return TripleComponent::Iri::fromIriref(iriSv);
  }

  // Case 2: Absolute IRI without scheme (like `</prosite/PS51927>`). Strip
  // the `<`, the leading `/`, and the `>`, and append the rest to the base
  // prefix for absolute IRIs (which is non-empty here, see Case 1).
  if (iriSv[1] == '/') {
    return TripleComponent::Iri::fromPrefixAndSuffix(
        basePrefixForAbsoluteIris, iriSv.substr(2, iriSv.size() - 3));
  }

  // Case 3: Relative IRI (like `<UPI001AF4585D>`). Strip the angle brackets
  // and append the content to the base prefix for relative IRIs.
  AD_CORRECTNESS_CHECK(!basePrefixForRelativeIris.empty());
  return TripleComponent::Iri::fromPrefixAndSuffix(
      basePrefixForRelativeIris, iriSv.substr(1, iriSv.size() - 2));
}

// ____________________________________________________________________________
Iri Iri::fromStringRepresentation(std::string s) {
AD_CORRECTNESS_CHECK(s.starts_with("<") || s.starts_with("@"));
Expand Down
39 changes: 30 additions & 9 deletions src/parser/Iri.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2023, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Benedikt Maria Beckermann <[email protected]>
// Copyright 2023 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Authors: Benedikt Maria Beckermann <[email protected]>
// Hannah Bast <[email protected]>

#pragma once

Expand All @@ -14,15 +15,18 @@ namespace ad_utility::triple_component {
class Iri {
private:
// Store the string value of the IRI including the angle brackets.
// brackets.
std::string iri_;

// Create a new iri object
// Create a new `Iri` object
explicit Iri(std::string iri);

// Create a new iri using a prefix
// Create a new `Iri` using a prefix
Iri(const Iri& prefix, NormalizedStringView suffix);

// Pattern used to identify the scheme in an IRI. Note that we do not
// check the validity of the part before the `://` according to RFC 3987.
static constexpr std::string_view schemePattern = "://";

public:
// A default constructed IRI is empty.
Iri() = default;
Expand All @@ -36,15 +40,32 @@ class Iri {
const std::string& toStringRepresentation() const;
std::string& toStringRepresentation();

// Create a new `Iri` given an iri string with brackets.
// Create a new `Iri` given an IRI string with brackets.
static Iri fromIriref(std::string_view stringWithBrackets);

// Create a new `Iri` given an iri string without brackets.
// Create a new `Iri` given an IRI string without brackets.
static Iri fromIrirefWithoutBrackets(std::string_view stringWithoutBrackets);

// Create a new iri given a prefix iri and its suffix
// Create a new `Iri` given a prefix IRI and its suffix
static Iri fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix);

// Create a new `Iri` object, considering the base IRI. For IRIs with a scheme
// (like `<http://...>`), this is the same as `fromIriref`. For IRIs without a
// scheme, prepend the base prefix for relative IRIs (like `<UPI001AF4585D>`)
// or for absolute IRIs (like `</prosite/PS51927>`).
static Iri fromIrirefConsiderBase(std::string_view iriStringWithBrackets,
const Iri& basePrefixForRelativeIris,
const Iri& basePrefixForAbsoluteIris);

// Get the base IRI from this `Iri` object. The returned `Iri` always has a
// `/` at the end. If `domainOnly` is true, remove the path part, for
// example, for `<http://purl.uniprot.org/uniprot/>` the method returns
// `<http://purl.uniprot.org/>`.
Iri getBaseIri(bool domainOnly) const;

// Return true iff the IRI is empty.
bool empty() const { return iri_.empty(); }

// Return the string value of the iri object without any leading or trailing
// angled brackets.
NormalizedStringView getContent() const;
Expand Down
18 changes: 17 additions & 1 deletion src/parser/ParallelBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ std::optional<size_t> ParallelBufferWithEndRegex::findRegexNearEnd(
// _____________________________________________________________________________
std::optional<ParallelBuffer::BufferType>
ParallelBufferWithEndRegex::getNextBlock() {
// Get the block of data read asynchronously after the previous call
// to `getNextBlock`.
auto rawInput = rawBuffer_.getNextBlock();

// If there was no more data, return the remainder or `std::nullopt` if
// it is empty.
if (!rawInput || exhausted_) {
exhausted_ = true;
if (remainder_.empty()) {
Expand All @@ -85,7 +90,15 @@ ParallelBufferWithEndRegex::getNextBlock() {
return copy;
}

// Find `endRegex_` in the data (searching from the back, in chunks of
// exponentially increasing size). Note that this does not necessarily
// find the last match of `endRegex_` in the data, but the first match in the
// last chunk (from the back), where there is a match.
auto endPosition = findRegexNearEnd(rawInput.value(), endRegex_);

// If no match was found at all, report an error, except when this is the
// last block (then `getNextBlock` will return `std::nullopt`, and we simply
// concatenate it to the remainder).
if (!endPosition) {
if (rawBuffer_.getNextBlock()) {
throw std::runtime_error(absl::StrCat(
Expand All @@ -95,10 +108,13 @@ ParallelBufferWithEndRegex::getNextBlock() {
"increase the FILE_BUFFER_SIZE "
"or set \"parallel-parsing: false\" in the settings file."));
}
// This was the last (possibly incomplete) block, simply concatenate it to
// the remainder.
endPosition = rawInput->size();
exhausted_ = true;
}

// Concatenate the remainder (part after `endRegex_`) of the block from the
// previous round with the part of the block until `endRegex_` from this
// round.
BufferType result;
result.reserve(remainder_.size() + *endPosition);
result.insert(result.end(), remainder_.begin(), remainder_.end());
Expand Down
10 changes: 6 additions & 4 deletions src/parser/ParallelBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,19 @@ class ParallelFileBuffer : public ParallelBuffer {
std::future<size_t> fut_;
};

/// A parallel buffer, where each of the blocks except for the last one has to
/// end with a certain regex (e.g. a full stop followed by whitespace and a
/// newline to denote the end of a triple in a .ttl file).
// A parallel buffer that reads input from the file in blocks, where each block,
// except possibly the last, ends with `endRegex`.
class ParallelBufferWithEndRegex : public ParallelBuffer {
public:
ParallelBufferWithEndRegex(size_t blocksize, std::string endRegex)
: ParallelBuffer{blocksize},
endRegex_{endRegex},
endRegexAsString_{std::move(endRegex)} {}

// __________________________________________________________________________
// Get the data that was read asynchronously after the previous call to this
// function. Returns the part of the data until `endRegex` is found, with the
// part after `endRegex` from the previous call prepended. If `endRegex` is
// not found, simply return the rest of the data.
std::optional<BufferType> getNextBlock() override;

// Open the file from which the blocks are read.
Expand Down
40 changes: 28 additions & 12 deletions src/parser/RdfParser.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2018, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach(joka921) <[email protected]>
// Copyright 2018 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Authors: Johannes Kalmbach <[email protected]>
// Hannah Bast <[email protected]>

#include "parser/RdfParser.h"

Expand Down Expand Up @@ -55,7 +56,9 @@ template <class T>
bool TurtleParser<T>::base() {
if (skip<TurtleTokenId::TurtleBase>()) {
if (iriref() && check(skip<TurtleTokenId::Dot>())) {
prefixMap_[""] = lastParseResult_.getIri();
const auto& iri = lastParseResult_.getIri();
prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false);
prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true);
return true;
} else {
raise("Parsing @base definition failed");
Expand Down Expand Up @@ -85,7 +88,9 @@ template <class T>
bool TurtleParser<T>::sparqlBase() {
if (skip<TurtleTokenId::SparqlBase>()) {
if (iriref()) {
prefixMap_[""] = lastParseResult_.getIri();
auto iri = lastParseResult_.getIri();
prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false);
prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true);
return true;
} else {
raise("Parsing BASE definition failed");
Expand Down Expand Up @@ -745,8 +750,8 @@ bool TurtleParser<T>::iriref() {
// more relaxed way.
if constexpr (UseRelaxedParsing) {
tok_.remove_prefix(endPos + 1);
lastParseResult_ =
TripleComponent::Iri::fromIriref(view.substr(0, endPos + 1));
lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase(
view.substr(0, endPos + 1), baseForRelativeIri(), baseForAbsoluteIri());
return true;
} else {
if (!parseTerminal<TurtleTokenId::Iriref>()) {
Expand All @@ -756,8 +761,9 @@ bool TurtleParser<T>::iriref() {
return false;
}
}
lastParseResult_ =
TripleComponent::Iri::fromIriref(lastParseResult_.getString());
lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase(
lastParseResult_.getString(), baseForRelativeIri(),
baseForAbsoluteIri());
return true;
}
}
Expand Down Expand Up @@ -817,7 +823,19 @@ bool RdfStreamParser<T>::resetStateAndRead(
template <class T>
void RdfStreamParser<T>::initialize(const string& filename) {
this->clear();
fileBuffer_ = std::make_unique<ParallelFileBuffer>(bufferSize_);
// Make sure that a block of data ends with a newline. This is important for
// two reasons:
//
// 1. A block of data must not end in the middle of a comment. Otherwise the
// remaining part of the comment, which is prepended to the next block, is
// not recognized as a comment.
//
// 2. A block of data must not end with a `.` (without subsequent newline).
// The reason is that with a `.` at the end, we cannot decide whether we are
// in the middle of a `PN_LOCAL` (that continues in the next buffer) or at the
// end of a statement.
fileBuffer_ =
std::make_unique<ParallelBufferWithEndRegex>(bufferSize_, "([\\r\\n]+)");
fileBuffer_->open(filename);
byteVec_.resize(bufferSize_);
// decompress the first block and initialize Tokenizer
Expand Down Expand Up @@ -847,8 +865,6 @@ bool RdfStreamParser<T>::getLineImpl(TurtleTriple* triple) {
// immediately rethrown. If we are reading from a stream in chunks of
// bytes, we can try again with a larger buffer.
try {
// variable parsedStatement will be true iff a statement can
// successfully be parsed
parsedStatement = T::statement();
} catch (const typename T::ParseException& p) {
parsedStatement = false;
Expand Down
36 changes: 28 additions & 8 deletions src/parser/RdfParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,12 @@ class TurtleParser : public RdfParserBase {
static constexpr std::array<const char*, 3> floatDatatypes_ = {
XSD_DECIMAL_TYPE, XSD_DOUBLE_TYPE, XSD_FLOAT_TYPE};

// The keys for storing the base prefix (for relative and absolute IRIs) in
// the prefix map. The only thing that is important about these keys is that
// they are different from each other and from any valid prefix name.
static constexpr const char* baseForRelativeIriKey_ = "@";
static constexpr const char* baseForAbsoluteIriKey_ = "@@";

protected:
// Data members.

Expand All @@ -187,9 +193,23 @@ class TurtleParser : public RdfParserBase {
// `TripleComponent` since it can hold any parsing result, not only objects.
TripleComponent lastParseResult_;

// Maps prefixes to their expanded form, initialized with the empty base
// (i.e. the prefix ":" maps to the empty IRI).
ad_utility::HashMap<std::string, TripleComponent::Iri> prefixMap_{{{}, {}}};
// Map that maps prefix names to their IRI. For our tests, it is important
// that without any BASE declaration, the two base prefixes are mapped to the
// empty IRI.
static const inline ad_utility::HashMap<std::string, TripleComponent::Iri>
prefixMapDefault_{{baseForRelativeIriKey_, TripleComponent::Iri{}},
{baseForAbsoluteIriKey_, TripleComponent::Iri{}}};
ad_utility::HashMap<std::string, TripleComponent::Iri> prefixMap_ =
prefixMapDefault_;

// Getters for the two base prefixes. Without BASE declaration, these will
// both return the empty IRI.
// Look up the entry stored under `baseForRelativeIriKey_` in `prefixMap_`.
const TripleComponent::Iri& baseForRelativeIri() {
  const auto& relativeBase = prefixMap_.at(baseForRelativeIriKey_);
  return relativeBase;
}
// Look up the entry stored under `baseForAbsoluteIriKey_` in `prefixMap_`.
const TripleComponent::Iri& baseForAbsoluteIri() {
  const auto& absoluteBase = prefixMap_.at(baseForAbsoluteIriKey_);
  return absoluteBase;
}

// There are turtle constructs that reuse prefixes, subjects and predicates
// so we have to save the last seen ones.
Expand Down Expand Up @@ -222,7 +242,7 @@ class TurtleParser : public RdfParserBase {
activePredicate_ = TripleComponent::Iri::fromIriref("<>");
activePrefix_.clear();

prefixMap_.clear();
prefixMap_ = prefixMapDefault_;

tok_.reset(nullptr, 0);
triples_.clear();
Expand Down Expand Up @@ -400,6 +420,8 @@ class TurtleParser : public RdfParserBase {
FRIEND_TEST(RdfParserTest, predicateObjectList);
FRIEND_TEST(RdfParserTest, objectList);
FRIEND_TEST(RdfParserTest, object);
FRIEND_TEST(RdfParserTest, base);
FRIEND_TEST(RdfParserTest, sparqlBase);
FRIEND_TEST(RdfParserTest, blankNode);
FRIEND_TEST(RdfParserTest, blankNodePropertyList);
FRIEND_TEST(RdfParserTest, numericLiteral);
Expand Down Expand Up @@ -516,8 +538,6 @@ class RdfStringParser : public Parser {
this->tok_.reset(tmpToParse_.data(), tmpToParse_.size());
}

void setPrefixMap(decltype(prefixMap_) m) { prefixMap_ = std::move(m); }

// Read-only access to the parser's current prefix map (`prefixMap_`).
const auto& getPrefixMap() const { return prefixMap_; }

// __________________________________________________________
Expand Down Expand Up @@ -604,10 +624,10 @@ class RdfStreamParser : public Parser {
// that's why we need the backupState() and resetStateAndRead() methods
ParallelBuffer::BufferType byteVec_;

std::unique_ptr<ParallelBuffer> fileBuffer_;
size_t bufferSize_ = FILE_BUFFER_SIZE;
std::unique_ptr<ParallelBufferWithEndRegex> fileBuffer_;
// this many characters will be buffered at once,
// defaults to a global constant
size_t bufferSize_ = FILE_BUFFER_SIZE;

// that many bytes were already parsed before dealing with the current batch
// in member byteVec_
Expand Down
Loading

0 comments on commit 0400f90

Please sign in to comment.