-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BASE
declarations are now handled correctly (#1660)
When a `BASE` IRI is defined, IRIs without a scheme (that is, IRIs that do not contain `://`, unlike `<http://...>`) should be resolved against it. For example, with `BASE <http://purl.uniprot.org/uniprot/>`, the relative IRI `<UPI001AF4585D>` should be resolved to `<http://purl.uniprot.org/uniprot/UPI001AF4585D>`, and the absolute IRI `</prosite/PS51927>` should be resolved to `<http://purl.uniprot.org/prosite/PS51927>`. NOTE: Without a `BASE` declaration, relative IRIs like `<a>` are accepted and left unchanged. This is used extensively in our tests. This is not 100% compliant with the SPARQL standard, but we consider being slightly more accepting than the standard harmless in this case.
- Loading branch information
1 parent
01d8306
commit 0400f90
Showing
8 changed files
with
272 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
// Copyright 2023, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Benedikt Maria Beckermann <[email protected]> | ||
// Copyright 2023 - 2024, University of Freiburg | ||
// Chair of Algorithms and Data Structures | ||
// Authors: Benedikt Maria Beckermann <[email protected]> | ||
// Hannah Bast <[email protected]> | ||
|
||
#include "parser/Iri.h" | ||
|
||
|
@@ -47,6 +48,60 @@ Iri Iri::fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix) { | |
return Iri{prefix, asNormalizedStringViewUnsafe(suffixNormalized)}; | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
// Derive a base IRI from this `Iri`. The result always ends with `/`.
//
// `domainOnly == false`: keep the full IRI and append a `/` if it does not
// already end with one.
// `domainOnly == true`: cut the IRI after the first `/` that follows the
// scheme separator (or after the first `/` at all if there is no scheme). If
// there is no such `/`, the full IRI with `/` appended is returned.
//
// The stored string must be enclosed in `<` and `>`. An IRI without a scheme
// separator is accepted, but a warning is logged.
Iri Iri::getBaseIri(bool domainOnly) const {
  AD_CORRECTNESS_CHECK(iri_.starts_with('<') && iri_.ends_with('>'), iri_);
  const std::string_view withBrackets{iri_};
  // The IRI without the enclosing angle brackets.
  const std::string_view content =
      withBrackets.substr(1, withBrackets.size() - 2);

  // Decide where the search for the first `/` of the path starts: directly
  // after the scheme separator if there is one, otherwise directly after `<`.
  const size_t schemeStart = withBrackets.find(schemePattern);
  const bool hasScheme = schemeStart != std::string_view::npos;
  if (!hasScheme) {
    LOG(WARN) << "No scheme found in base IRI: \"" << iri_ << "\""
              << " (but we accept it anyway)" << std::endl;
  }
  const size_t searchFrom =
      hasScheme ? schemeStart + schemePattern.size() : size_t{1};
  const size_t firstSlash = withBrackets.find('/', searchFrom);

  const bool pathIsEmpty = firstSlash == std::string_view::npos;
  // The character right before the closing `>`.
  const bool endsWithSlash = withBrackets[withBrackets.size() - 2] == '/';

  // Empty path, or the full IRI is requested and lacks the final `/`: return
  // the full IRI with `/` appended.
  if (pathIsEmpty || (!domainOnly && !endsWithSlash)) {
    return fromIrirefWithoutBrackets(absl::StrCat(content, "/"sv));
  }
  // The full IRI is requested and already ends with `/`: nothing to do.
  if (!domainOnly) {
    return *this;
  }
  // Keep everything up to and including the first `/` of the path. Note that
  // `firstSlash` is an index into the string with brackets, so it equals the
  // number of characters of `content` to keep.
  return fromIrirefWithoutBrackets(content.substr(0, firstSlash));
}
|
||
// ____________________________________________________________________________ | ||
// Create an `Iri` from `iriStringWithBrackets`, which must be enclosed in `<`
// and `>`, prepending a base prefix where needed:
//
// 1. If the IRI contains the scheme separator `schemePattern`, or if
//    `basePrefixForAbsoluteIris` is empty, the IRI is taken as is.
// 2. Otherwise, if the IRI starts with `/` (like `</prosite/PS51927>`), the
//    part after that `/` is appended to `basePrefixForAbsoluteIris`.
// 3. Otherwise (like `<UPI001AF4585D>`), the IRI is appended to
//    `basePrefixForRelativeIris`, which must not be empty.
//
// NOTE(review): Only `schemePattern` (`://`) is recognized as a scheme. IRIs
// with a scheme but without `//` (for example `<urn:isbn:...>` or
// `<mailto:...>`) therefore end up in case 3 when a base is set. Confirm that
// this is intended.
Iri Iri::fromIrirefConsiderBase(std::string_view iriStringWithBrackets,
                                const Iri& basePrefixForRelativeIris,
                                const Iri& basePrefixForAbsoluteIris) {
  auto iriSv = iriStringWithBrackets;
  AD_CORRECTNESS_CHECK(iriSv.size() >= 2);
  AD_CORRECTNESS_CHECK(iriSv[0] == '<' && iriSv[iriSv.size() - 1] == '>');
  // Use the same scheme pattern as `getBaseIri`, so that both functions
  // always agree on what counts as an IRI with a scheme.
  const bool hasScheme = iriSv.find(schemePattern) != std::string_view::npos;
  if (hasScheme || basePrefixForAbsoluteIris.empty()) {
    // Case 1: IRI with scheme (like `<http://...>`), or the base prefix for
    // absolute IRIs is empty. Take the IRI as is.
    return TripleComponent::Iri::fromIriref(iriSv);
  }
  // From here on, `basePrefixForAbsoluteIris` is known to be non-empty.
  if (iriSv[1] == '/') {
    // Case 2: Absolute IRI without scheme (like `</prosite/PS51927>`). Skip
    // the `<` and the leading `/`, and drop the trailing `>`.
    return TripleComponent::Iri::fromPrefixAndSuffix(
        basePrefixForAbsoluteIris, iriSv.substr(2, iriSv.size() - 3));
  }
  // Case 3: Relative IRI (like `<UPI001AF4585D>`). Strip only the brackets.
  AD_CORRECTNESS_CHECK(!basePrefixForRelativeIris.empty());
  return TripleComponent::Iri::fromPrefixAndSuffix(
      basePrefixForRelativeIris, iriSv.substr(1, iriSv.size() - 2));
}
|
||
// ____________________________________________________________________________ | ||
Iri Iri::fromStringRepresentation(std::string s) { | ||
AD_CORRECTNESS_CHECK(s.starts_with("<") || s.starts_with("@")); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
// Copyright 2023, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Benedikt Maria Beckermann <[email protected]> | ||
// Copyright 2023 - 2024, University of Freiburg | ||
// Chair of Algorithms and Data Structures | ||
// Authors: Benedikt Maria Beckermann <[email protected]> | ||
// Hannah Bast <[email protected]> | ||
|
||
#pragma once | ||
|
||
|
@@ -14,15 +15,18 @@ namespace ad_utility::triple_component { | |
class Iri { | ||
private: | ||
// Store the string value of the IRI including the angle brackets. | ||
// brackets. | ||
std::string iri_; | ||
|
||
// Create a new iri object | ||
// Create a new `Iri` object | ||
explicit Iri(std::string iri); | ||
|
||
// Create a new iri using a prefix | ||
// Create a new `Iri` using a prefix | ||
Iri(const Iri& prefix, NormalizedStringView suffix); | ||
|
||
// Pattern used to identify the scheme in an IRI. Note that we do not | ||
// check the validity of the part before the `://` according to RFC 3987. | ||
static constexpr std::string_view schemePattern = "://"; | ||
|
||
public: | ||
// A default constructed IRI is empty. | ||
Iri() = default; | ||
|
@@ -36,15 +40,32 @@ class Iri { | |
const std::string& toStringRepresentation() const; | ||
std::string& toStringRepresentation(); | ||
|
||
// Create a new `Iri` given an iri string with brackets. | ||
// Create a new `Iri` given an IRI string with brackets. | ||
static Iri fromIriref(std::string_view stringWithBrackets); | ||
|
||
// Create a new `Iri` given an iri string without brackets. | ||
// Create a new `Iri` given an IRI string without brackets. | ||
static Iri fromIrirefWithoutBrackets(std::string_view stringWithoutBrackets); | ||
|
||
// Create a new iri given a prefix iri and its suffix | ||
// Create a new `Iri` given a prefix IRI and its suffix | ||
static Iri fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix); | ||
|
||
// Create a new `Iri` object, considering the base IRI. For IRIs with a scheme | ||
// (like `<http://...>`), this is the same as `fromIriref`. For IRIs without a | ||
// scheme, prepend the base prefix for relative IRIs (like `<UPI001AF4585D>`) | ||
// or for absolute IRIs (like `</prosite/PS51927>`). | ||
static Iri fromIrirefConsiderBase(std::string_view iriStringWithBrackets, | ||
const Iri& basePrefixForRelativeIris, | ||
const Iri& basePrefixForAbsoluteIris); | ||
|
||
// Get the base IRI from this `Iri` object. The returned `Iri` always has a | ||
// `/` at the end. If `domainOnly` is true, remove the path part, for | ||
// example, for `<http://purl.uniprot.org/uniprot/>` the method returns | ||
// `<http://purl.uniprot.org/>`. | ||
Iri getBaseIri(bool domainOnly) const; | ||
|
||
// Return true iff the IRI is empty. | ||
bool empty() const { return iri_.empty(); } | ||
|
||
// Return the string value of the iri object without any leading or trailing | ||
// angled brackets. | ||
NormalizedStringView getContent() const; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
// Copyright 2018, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Johannes Kalmbach(joka921) <[email protected]> | ||
// Copyright 2018 - 2024, University of Freiburg | ||
// Chair of Algorithms and Data Structures | ||
// Authors: Johannes Kalmbach <[email protected]> | ||
// Hannah Bast <[email protected]> | ||
|
||
#include "parser/RdfParser.h" | ||
|
||
|
@@ -55,7 +56,9 @@ template <class T> | |
bool TurtleParser<T>::base() { | ||
if (skip<TurtleTokenId::TurtleBase>()) { | ||
if (iriref() && check(skip<TurtleTokenId::Dot>())) { | ||
prefixMap_[""] = lastParseResult_.getIri(); | ||
const auto& iri = lastParseResult_.getIri(); | ||
prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false); | ||
prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true); | ||
return true; | ||
} else { | ||
raise("Parsing @base definition failed"); | ||
|
@@ -85,7 +88,9 @@ template <class T> | |
bool TurtleParser<T>::sparqlBase() { | ||
if (skip<TurtleTokenId::SparqlBase>()) { | ||
if (iriref()) { | ||
prefixMap_[""] = lastParseResult_.getIri(); | ||
auto iri = lastParseResult_.getIri(); | ||
prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false); | ||
prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true); | ||
return true; | ||
} else { | ||
raise("Parsing BASE definition failed"); | ||
|
@@ -745,8 +750,8 @@ bool TurtleParser<T>::iriref() { | |
// more relaxed way. | ||
if constexpr (UseRelaxedParsing) { | ||
tok_.remove_prefix(endPos + 1); | ||
lastParseResult_ = | ||
TripleComponent::Iri::fromIriref(view.substr(0, endPos + 1)); | ||
lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase( | ||
view.substr(0, endPos + 1), baseForRelativeIri(), baseForAbsoluteIri()); | ||
return true; | ||
} else { | ||
if (!parseTerminal<TurtleTokenId::Iriref>()) { | ||
|
@@ -756,8 +761,9 @@ bool TurtleParser<T>::iriref() { | |
return false; | ||
} | ||
} | ||
lastParseResult_ = | ||
TripleComponent::Iri::fromIriref(lastParseResult_.getString()); | ||
lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase( | ||
lastParseResult_.getString(), baseForRelativeIri(), | ||
baseForAbsoluteIri()); | ||
return true; | ||
} | ||
} | ||
|
@@ -817,7 +823,19 @@ bool RdfStreamParser<T>::resetStateAndRead( | |
template <class T> | ||
void RdfStreamParser<T>::initialize(const string& filename) { | ||
this->clear(); | ||
fileBuffer_ = std::make_unique<ParallelFileBuffer>(bufferSize_); | ||
// Make sure that a block of data ends with a newline. This is important for | ||
// two reasons: | ||
// | ||
// 1. A block of data must not end in the middle of a comment. Otherwise the | ||
// remaining part of the comment, which is prepended to the next block, is | ||
// not recognized as a comment. | ||
// | ||
// 2. A block of data must not end with a `.` (without subsequent newline). | ||
// The reason is that with a `.` at the end, we cannot decide whether we are | ||
// in the middle of a `PN_LOCAL` (that continues in the next buffer) or at the | ||
// end of a statement. | ||
fileBuffer_ = | ||
std::make_unique<ParallelBufferWithEndRegex>(bufferSize_, "([\\r\\n]+)"); | ||
fileBuffer_->open(filename); | ||
byteVec_.resize(bufferSize_); | ||
// decompress the first block and initialize Tokenizer | ||
|
@@ -847,8 +865,6 @@ bool RdfStreamParser<T>::getLineImpl(TurtleTriple* triple) { | |
// immediately rethrown. If we are reading from a stream in chunks of | ||
// bytes, we can try again with a larger buffer. | ||
try { | ||
// variable parsedStatement will be true iff a statement can | ||
// successfully be parsed | ||
parsedStatement = T::statement(); | ||
} catch (const typename T::ParseException& p) { | ||
parsedStatement = false; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.