diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 71d4897878..2656a4f577 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -31,7 +31,7 @@ constexpr inline size_t PARSER_MIN_TRIPLES_AT_ONCE = 10'000; // When reading from a file, Chunks of this size will -// be fed to the parser at once (10 MiB). -constinit inline std::atomic FILE_BUFFER_SIZE = 10 * (1ul << 20); +// be fed to the parser at once (100 MiB). +constinit inline std::atomic FILE_BUFFER_SIZE = 100 * (1ul << 20); constinit inline std::atomic BUFFER_SIZE_JOIN_PATTERNS_WITH_OSP = 50'000; diff --git a/src/parser/RdfParser.cpp b/src/parser/RdfParser.cpp index 9cba9f1455..232d25811e 100644 --- a/src/parser/RdfParser.cpp +++ b/src/parser/RdfParser.cpp @@ -55,7 +55,7 @@ template bool TurtleParser::base() { if (skip()) { if (iriref() && check(skip())) { - prefixMap_[""] = lastParseResult_.getIri(); + prefixMap_["_BASE"] = lastParseResult_.getIri(); return true; } else { raise("Parsing @base definition failed"); @@ -85,7 +85,7 @@ template bool TurtleParser::sparqlBase() { if (skip()) { if (iriref()) { - prefixMap_[""] = lastParseResult_.getIri(); + prefixMap_["_BASE"] = lastParseResult_.getIri(); return true; } else { raise("Parsing BASE definition failed"); @@ -740,13 +740,29 @@ bool TurtleParser::iriref() { "Unterminated IRI reference (found '<' but no '>' before " "one of the following characters: <, \", newline)"); } + // Helper lambda that calls `fromIriref` for absolute IRIs and + // `fromPrefixAndSuffix` with the BASE prefix for relative IRIs. 
+ auto makeIri = [this](std::string_view iri) { + if (iri.find("://") != std::string_view::npos || iri.size() == 2) { + return TripleComponent::Iri::fromIriref(iri); + } else { + AD_CORRECTNESS_CHECK(iri.size() >= 2); + AD_CORRECTNESS_CHECK(iri[0] == '<' && iri[iri.size() - 1] == '>'); + auto basePrefix = prefixMap_.find("_BASE"); + if (basePrefix == prefixMap_.end()) { + raise(absl::StrCat("Relative IRI reference `", iri, + "` found but no base IRI defined")); + } + return TripleComponent::Iri::fromPrefixAndSuffix( + basePrefix->second, iri.substr(1, iri.size() - 2)); + } + }; // In relaxed mode, that is all we check. Otherwise, we check if the IRI is // standard-compliant. If not, we output a warning and try to parse it in a // more relaxed way. if constexpr (UseRelaxedParsing) { tok_.remove_prefix(endPos + 1); - lastParseResult_ = - TripleComponent::Iri::fromIriref(view.substr(0, endPos + 1)); + lastParseResult_ = makeIri(view.substr(0, endPos + 1)); return true; } else { if (!parseTerminal()) { @@ -756,8 +772,7 @@ bool TurtleParser::iriref() { return false; } } - lastParseResult_ = - TripleComponent::Iri::fromIriref(lastParseResult_.getString()); + lastParseResult_ = makeIri(lastParseResult_.getString()); return true; } } diff --git a/src/parser/RdfParser.h b/src/parser/RdfParser.h index ca55c61993..8f593c240a 100644 --- a/src/parser/RdfParser.h +++ b/src/parser/RdfParser.h @@ -187,7 +187,7 @@ class TurtleParser : public RdfParserBase { // `TripleComponent` since it can hold any parsing result, not only objects. TripleComponent lastParseResult_; - // Maps prefixes to their expanded form, initialized with the empty base + // Maps prefixes to their expanded form, initialized with the empty prefix // (i.e. the prefix ":" maps to the empty IRI). 
ad_utility::HashMap prefixMap_{{{}, {}}}; diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 92d665a491..ebab28df1f 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -142,6 +142,7 @@ Index makeTestIndex(const std::string& indexBasename, std::string inputFilename = indexBasename + ".ttl"; if (!turtleInput.has_value()) { turtleInput = + "BASE <> " "