Skip to content

Commit

Permalink
First step towards a correct implementation of BASE
Browse files Browse the repository at this point in the history
When a BASE IRI is defined, relative IRIs should be resolved against it.
For example, with `BASE <http://example.org/>`, the relative IRI `<a>`
should be resolved to `<http://example.org/a>`.

This is a first step in this direction. So far, most of the tests fail
because they use relative IRIs extensively, assuming that they are
left untouched.

PS: UniProt uses BASE extensively, so this is a blocker for parsing
UniProt via Turtle.
  • Loading branch information
Hannah Bast committed Dec 5, 2024
1 parent ec806f0 commit 62f7f96
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/index/ConstantsIndexBuilding.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ constexpr inline size_t PARSER_MIN_TRIPLES_AT_ONCE = 10'000;

// When reading from a file, chunks of this size will
// be fed to the parser at once (100 MiB).
constinit inline std::atomic<size_t> FILE_BUFFER_SIZE = 10 * (1ul << 20);
constinit inline std::atomic<size_t> FILE_BUFFER_SIZE = 100 * (1ul << 20);

constinit inline std::atomic<size_t> BUFFER_SIZE_JOIN_PATTERNS_WITH_OSP =
50'000;
Expand Down
27 changes: 21 additions & 6 deletions src/parser/RdfParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ template <class T>
bool TurtleParser<T>::base() {
if (skip<TurtleTokenId::TurtleBase>()) {
if (iriref() && check(skip<TurtleTokenId::Dot>())) {
prefixMap_[""] = lastParseResult_.getIri();
prefixMap_["_BASE"] = lastParseResult_.getIri();
return true;
} else {
raise("Parsing @base definition failed");
Expand Down Expand Up @@ -85,7 +85,7 @@ template <class T>
bool TurtleParser<T>::sparqlBase() {
if (skip<TurtleTokenId::SparqlBase>()) {
if (iriref()) {
prefixMap_[""] = lastParseResult_.getIri();
prefixMap_["_BASE"] = lastParseResult_.getIri();
return true;
} else {
raise("Parsing BASE definition failed");
Expand Down Expand Up @@ -740,13 +740,29 @@ bool TurtleParser<T>::iriref() {
"Unterminated IRI reference (found '<' but no '>' before "
"one of the following characters: <, \", newline)");
}
// Turn an IRI reference (still enclosed in its angle brackets) into a
// `TripleComponent::Iri`.
//
// * References that contain `://`, as well as references that are exactly
//   two characters long (e.g. the empty reference `<>`), are handed to
//   `fromIriref` unchanged.
// * Every other reference is considered relative: its content (without the
//   angle brackets) is combined with the IRI stored under the `_BASE` key of
//   `prefixMap_` via `fromPrefixAndSuffix`. If no such entry exists, an error
//   is raised.
//
// NOTE(review): the `://` test is a heuristic. References with a scheme but
// without `//` (e.g. `urn:` or `mailto:`) take the relative branch - confirm
// that this is intended.
auto makeIri = [this](std::string_view iri) {
  const bool containsSchemeSeparator =
      iri.find("://") != std::string_view::npos;
  const bool isTwoCharacterReference = iri.size() == 2;
  if (containsSchemeSeparator || isTwoCharacterReference) {
    return TripleComponent::Iri::fromIriref(iri);
  }

  // From here on the reference is treated as relative. Sanity-check that it
  // really is of the form `<...>` before stripping the brackets.
  AD_CORRECTNESS_CHECK(iri.size() >= 2);
  AD_CORRECTNESS_CHECK(iri[0] == '<' && iri[iri.size() - 1] == '>');

  auto basePrefix = prefixMap_.find("_BASE");
  if (basePrefix == prefixMap_.end()) {
    // Presumably `raise` does not return (TODO confirm); otherwise the
    // dereference of `basePrefix` below would be invalid.
    raise(absl::StrCat("Relative IRI reference `", iri,
                       "` found but no base IRI defined"));
  }

  // Drop the leading `<` and the trailing `>` and append the rest to the base.
  const std::string_view withoutBrackets = iri.substr(1, iri.size() - 2);
  return TripleComponent::Iri::fromPrefixAndSuffix(basePrefix->second,
                                                   withoutBrackets);
};
// In relaxed mode, that is all we check. Otherwise, we check if the IRI is
// standard-compliant. If not, we output a warning and try to parse it in a
// more relaxed way.
if constexpr (UseRelaxedParsing) {
tok_.remove_prefix(endPos + 1);
lastParseResult_ =
TripleComponent::Iri::fromIriref(view.substr(0, endPos + 1));
lastParseResult_ = makeIri(view.substr(0, endPos + 1));
return true;
} else {
if (!parseTerminal<TurtleTokenId::Iriref>()) {
Expand All @@ -756,8 +772,7 @@ bool TurtleParser<T>::iriref() {
return false;
}
}
lastParseResult_ =
TripleComponent::Iri::fromIriref(lastParseResult_.getString());
lastParseResult_ = makeIri(lastParseResult_.getString());
return true;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/parser/RdfParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class TurtleParser : public RdfParserBase {
// `TripleComponent` since it can hold any parsing result, not only objects.
TripleComponent lastParseResult_;

// Maps prefixes to their expanded form, initialized with the empty base
// Maps prefixes to their expanded form, initialized with the empty prefix
// (i.e. the prefix ":" maps to the empty IRI).
ad_utility::HashMap<std::string, TripleComponent::Iri> prefixMap_{{{}, {}}};

Expand Down
1 change: 1 addition & 0 deletions test/util/IndexTestHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ Index makeTestIndex(const std::string& indexBasename,
std::string inputFilename = indexBasename + ".ttl";
if (!turtleInput.has_value()) {
turtleInput =
"BASE <> "
"<x> <label> \"alpha\" . <x> <label> \"älpha\" . <x> <label> \"A\" . "
"<x> "
"<label> \"Beta\". <x> <is-a> <y>. <y> <is-a> <x>. <z> <label> "
Expand Down

0 comments on commit 62f7f96

Please sign in to comment.