BASE declarations are now handled correctly (#1660)

When a `BASE` IRI is defined, IRIs without a scheme (that is, IRIs that do not contain `://`, unlike `<http://...>`) should be resolved against it. For example, with `BASE <http://purl.uniprot.org/uniprot/>`, the relative IRI `<UPI001AF4585D>` should be resolved to `<http://purl.uniprot.org/uniprot/UPI001AF4585D>`, and the absolute IRI `</prosite/PS51927>` should be resolved to `<http://purl.uniprot.org/prosite/PS51927>`. NOTE: Without a `BASE` declaration, relative IRIs like `<a>` are accepted and left unchanged. This is used extensively in our tests. This is not 100% compliant with the SPARQL standard, but we consider being slightly more accepting than the standard harmless in this case.
ad-freiburg · Dec 10, 2024 · 0400f90 · 0400f90
1 parent 01d8306
commit 0400f90
Show file tree

Hide file tree

Showing 8 changed files with 272 additions and 43 deletions.
diff --git a/src/parser/Iri.cpp b/src/parser/Iri.cpp
@@ -1,6 +1,7 @@
-// Copyright 2023, University of Freiburg,
-//                 Chair of Algorithms and Data Structures.
-// Author: Benedikt Maria Beckermann <[email protected]>
+// Copyright 2023 - 2024, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Benedikt Maria Beckermann <[email protected]>
+//          Hannah Bast <[email protected]>
 
 #include "parser/Iri.h"
 
@@ -47,6 +48,60 @@ Iri Iri::fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix) {
   return Iri{prefix, asNormalizedStringViewUnsafe(suffixNormalized)};
 }
 
+// ____________________________________________________________________________
+Iri Iri::getBaseIri(bool domainOnly) const {
+  AD_CORRECTNESS_CHECK(iri_.starts_with('<') && iri_.ends_with('>'), iri_);
+  // Find the first `/` after the scheme pattern `://`. If there is no scheme,
+  // warn and search from index 1, that is, right after the opening `<`.
+  size_t pos = iri_.find(schemePattern);
+  if (pos == std::string::npos) {
+    LOG(WARN) << "No scheme found in base IRI: \"" << iri_ << "\""
+              << " (but we accept it anyway)" << std::endl;
+    pos = 1;
+  } else {
+    pos += schemePattern.size();
+  }
+  pos = iri_.find('/', pos);
+  // Append `/` to the IRI in two cases: there is no `/` after the scheme (empty
+  // path), or `domainOnly` is false and the IRI does not already end with `/`.
+  if (pos == std::string::npos ||
+      (!domainOnly && iri_[iri_.size() - 2] != '/')) {
+    return fromIrirefWithoutBrackets(
+        absl::StrCat(std::string_view(iri_).substr(1, iri_.size() - 2), "/"sv));
+  }
+  // If `domainOnly` is true, keep everything up to and including that `/`.
+  if (domainOnly) {
+    return fromIrirefWithoutBrackets(std::string_view(iri_).substr(1, pos));
+  }
+  // Otherwise, the IRI already ends with `/`, so return it unchanged.
+  return *this;
+}
+
+// ____________________________________________________________________________
+Iri Iri::fromIrirefConsiderBase(std::string_view iriStringWithBrackets,
+                                const Iri& basePrefixForRelativeIris,
+                                const Iri& basePrefixForAbsoluteIris) {
+  auto iriSv = iriStringWithBrackets;
+  AD_CORRECTNESS_CHECK(iriSv.size() >= 2);
+  AD_CORRECTNESS_CHECK(iriSv[0] == '<' && iriSv[iriSv.size() - 1] == '>');
+  if (iriSv.find("://") != std::string_view::npos ||
+      basePrefixForAbsoluteIris.empty()) {
+    // Case 1: IRI with scheme (like `<http://...>`), or the base prefix for
+    // absolute IRIs is empty. In both cases, the IRI is taken as is.
+    return TripleComponent::Iri::fromIriref(iriSv);
+  } else if (iriSv[1] == '/') {
+    // Case 2: Absolute IRI without scheme (like `</prosite/PS51927>`).
+    AD_CORRECTNESS_CHECK(!basePrefixForAbsoluteIris.empty());
+    return TripleComponent::Iri::fromPrefixAndSuffix(
+        basePrefixForAbsoluteIris, iriSv.substr(2, iriSv.size() - 3));
+  } else {
+    // Case 3: Relative IRI (like `<UPI001AF4585D>`).
+    AD_CORRECTNESS_CHECK(!basePrefixForRelativeIris.empty());
+    return TripleComponent::Iri::fromPrefixAndSuffix(
+        basePrefixForRelativeIris, iriSv.substr(1, iriSv.size() - 2));
+  }
+}
+
 // ____________________________________________________________________________
 Iri Iri::fromStringRepresentation(std::string s) {
   AD_CORRECTNESS_CHECK(s.starts_with("<") || s.starts_with("@"));

diff --git a/src/parser/Iri.h b/src/parser/Iri.h
@@ -1,6 +1,7 @@
-// Copyright 2023, University of Freiburg,
-//                 Chair of Algorithms and Data Structures.
-// Author: Benedikt Maria Beckermann <[email protected]>
+// Copyright 2023 - 2024, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Benedikt Maria Beckermann <[email protected]>
+//          Hannah Bast <[email protected]>
 
 #pragma once
 
@@ -14,15 +15,18 @@ namespace ad_utility::triple_component {
 class Iri {
  private:
   // Store the string value of the IRI including the angle brackets.
-  // brackets.
   std::string iri_;
 
-  // Create a new iri object
+  // Create a new `Iri` object
   explicit Iri(std::string iri);
 
-  // Create a new iri using a prefix
+  // Create a new `Iri` using a prefix
   Iri(const Iri& prefix, NormalizedStringView suffix);
 
+  // Pattern used to identify the scheme in an IRI. Note that we do not
+  // check the validity of the part before the `://` according to RFC 3987.
+  static constexpr std::string_view schemePattern = "://";
+
  public:
   // A default constructed IRI is empty.
   Iri() = default;
@@ -36,15 +40,32 @@ class Iri {
   const std::string& toStringRepresentation() const;
   std::string& toStringRepresentation();
 
-  // Create a new `Iri` given an iri string with brackets.
+  // Create a new `Iri` given an IRI string with brackets.
   static Iri fromIriref(std::string_view stringWithBrackets);
 
-  // Create a new `Iri` given an iri string without brackets.
+  // Create a new `Iri` given an IRI string without brackets.
   static Iri fromIrirefWithoutBrackets(std::string_view stringWithoutBrackets);
 
-  // Create a new iri given a prefix iri and its suffix
+  // Create a new `Iri` given a prefix IRI and its suffix
   static Iri fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix);
 
+  // Create a new `Iri` object, considering the base IRI. For IRIs with a scheme
+  // (like `<http://...>`), this is the same as `fromIriref`. For IRIs without a
+  // scheme, prepend the base prefix for relative IRIs (like `<UPI001AF4585D>`)
+  // or for absolute IRIs (like `</prosite/PS51927>`).
+  static Iri fromIrirefConsiderBase(std::string_view iriStringWithBrackets,
+                                    const Iri& basePrefixForRelativeIris,
+                                    const Iri& basePrefixForAbsoluteIris);
+
+  // Get the base IRI from this `Iri` object. The returned `Iri` always has a
+  // `/` at the end. If `domainOnly` is true, remove the path part, for
+  // example, for `<http://purl.uniprot.org/uniprot/>` the method returns
+  // `<http://purl.uniprot.org/>`.
+  Iri getBaseIri(bool domainOnly) const;
+
+  // Return true iff the IRI is empty.
+  bool empty() const { return iri_.empty(); }
+
   // Return the string value of the iri object without any leading or trailing
   // angled brackets.
   NormalizedStringView getContent() const;

diff --git a/src/parser/ParallelBuffer.cpp b/src/parser/ParallelBuffer.cpp
@@ -72,7 +72,12 @@ std::optional<size_t> ParallelBufferWithEndRegex::findRegexNearEnd(
 // _____________________________________________________________________________
 std::optional<ParallelBuffer::BufferType>
 ParallelBufferWithEndRegex::getNextBlock() {
+  // Get the block of data read asynchronously after the previous call
+  // to `getNextBlock`.
   auto rawInput = rawBuffer_.getNextBlock();
+
+  // If there was no more data, return the remainder or `std::nullopt` if
+  // it is empty.
   if (!rawInput || exhausted_) {
     exhausted_ = true;
     if (remainder_.empty()) {
@@ -85,7 +90,15 @@ ParallelBufferWithEndRegex::getNextBlock() {
     return copy;
   }
 
+  // Find `endRegex_` in the data (searching from the back, in chunks of
+  // exponentially increasing size). Note that this does not necessarily
+  // find the last match of `endRegex_` in the data, but the first match in the
+  // last chunk (from the back), where there is a match.
   auto endPosition = findRegexNearEnd(rawInput.value(), endRegex_);
+
+  // If no match was found at all, report an error, except when this is the
+  // last block (then `getNextBlock` will return `std::nullopt`, and we simply
+  // concatenate it to the remainder).
   if (!endPosition) {
     if (rawBuffer_.getNextBlock()) {
       throw std::runtime_error(absl::StrCat(
@@ -95,10 +108,13 @@ ParallelBufferWithEndRegex::getNextBlock() {
           "increase the FILE_BUFFER_SIZE "
           "or set \"parallel-parsing: false\" in the settings file."));
     }
-    // This was the last (possibly incomplete) block, simply concatenate
     endPosition = rawInput->size();
     exhausted_ = true;
   }
+
+  // Concatenate the remainder (part after `endRegex_`) of the block from the
+  // previous round with the part of the block until `endRegex_` from this
+  // round.
   BufferType result;
   result.reserve(remainder_.size() + *endPosition);
   result.insert(result.end(), remainder_.begin(), remainder_.end());

diff --git a/src/parser/ParallelBuffer.h b/src/parser/ParallelBuffer.h
@@ -76,17 +76,19 @@ class ParallelFileBuffer : public ParallelBuffer {
   std::future<size_t> fut_;
 };
 
-/// A parallel buffer, where each of the blocks except for the last one has to
-/// end with a certain regex (e.g. a full stop followed by whitespace and a
-/// newline to denote the end of a triple in a .ttl file).
+// A parallel buffer that reads input from the file in blocks, where each block,
+// except possibly the last, ends with `endRegex`.
 class ParallelBufferWithEndRegex : public ParallelBuffer {
  public:
   ParallelBufferWithEndRegex(size_t blocksize, std::string endRegex)
       : ParallelBuffer{blocksize},
         endRegex_{endRegex},
         endRegexAsString_{std::move(endRegex)} {}
 
-  // __________________________________________________________________________
+  // Get the data that was read asynchronously after the previous call to this
+  // function. Returns the part of the data until `endRegex` is found, with the
+  // part after `endRegex` from the previous call prepended. If `endRegex` is
+  // not found, simply return the rest of the data.
   std::optional<BufferType> getNextBlock() override;
 
   // Open the file from which the blocks are read.

diff --git a/src/parser/RdfParser.cpp b/src/parser/RdfParser.cpp
@@ -1,6 +1,7 @@
-// Copyright 2018, University of Freiburg,
-// Chair of Algorithms and Data Structures.
-// Author: Johannes Kalmbach(joka921) <[email protected]>
+// Copyright 2018 - 2024, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Johannes Kalmbach <[email protected]>
+//          Hannah Bast <[email protected]>
 
 #include "parser/RdfParser.h"
 
@@ -55,7 +56,9 @@ template <class T>
 bool TurtleParser<T>::base() {
   if (skip<TurtleTokenId::TurtleBase>()) {
     if (iriref() && check(skip<TurtleTokenId::Dot>())) {
-      prefixMap_[""] = lastParseResult_.getIri();
+      const auto& iri = lastParseResult_.getIri();
+      prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false);
+      prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true);
       return true;
     } else {
       raise("Parsing @base definition failed");
@@ -85,7 +88,9 @@ template <class T>
 bool TurtleParser<T>::sparqlBase() {
   if (skip<TurtleTokenId::SparqlBase>()) {
     if (iriref()) {
-      prefixMap_[""] = lastParseResult_.getIri();
+      auto iri = lastParseResult_.getIri();
+      prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false);
+      prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true);
       return true;
     } else {
       raise("Parsing BASE definition failed");
@@ -745,8 +750,8 @@ bool TurtleParser<T>::iriref() {
   // more relaxed way.
   if constexpr (UseRelaxedParsing) {
     tok_.remove_prefix(endPos + 1);
-    lastParseResult_ =
-        TripleComponent::Iri::fromIriref(view.substr(0, endPos + 1));
+    lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase(
+        view.substr(0, endPos + 1), baseForRelativeIri(), baseForAbsoluteIri());
     return true;
   } else {
     if (!parseTerminal<TurtleTokenId::Iriref>()) {
@@ -756,8 +761,9 @@ bool TurtleParser<T>::iriref() {
         return false;
       }
     }
-    lastParseResult_ =
-        TripleComponent::Iri::fromIriref(lastParseResult_.getString());
+    lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase(
+        lastParseResult_.getString(), baseForRelativeIri(),
+        baseForAbsoluteIri());
     return true;
   }
 }
@@ -817,7 +823,19 @@ bool RdfStreamParser<T>::resetStateAndRead(
 template <class T>
 void RdfStreamParser<T>::initialize(const string& filename) {
   this->clear();
-  fileBuffer_ = std::make_unique<ParallelFileBuffer>(bufferSize_);
+  // Make sure that a block of data ends with a newline. This is important for
+  // two reasons:
+  //
+  // 1. A block of data must not end in the middle of a comment. Otherwise the
+  // remaining part of the comment, which is prepended to the next block, is
+  // not recognized as a comment.
+  //
+  // 2. A block of data must not end with a `.` (without subsequent newline).
+  // The reason is that with a `.` at the end, we cannot decide whether we are
+  // in the middle of a `PN_LOCAL` (that continues in the next buffer) or at the
+  // end of a statement.
+  fileBuffer_ =
+      std::make_unique<ParallelBufferWithEndRegex>(bufferSize_, "([\\r\\n]+)");
   fileBuffer_->open(filename);
   byteVec_.resize(bufferSize_);
   // decompress the first block and initialize Tokenizer
@@ -847,8 +865,6 @@ bool RdfStreamParser<T>::getLineImpl(TurtleTriple* triple) {
       // immediately rethrown. If we are reading from a stream in chunks of
       // bytes, we can try again with a larger buffer.
       try {
-        // variable parsedStatement will be true iff a statement can
-        // successfully be parsed
         parsedStatement = T::statement();
       } catch (const typename T::ParseException& p) {
         parsedStatement = false;

diff --git a/src/parser/RdfParser.h b/src/parser/RdfParser.h
@@ -169,6 +169,12 @@ class TurtleParser : public RdfParserBase {
   static constexpr std::array<const char*, 3> floatDatatypes_ = {
       XSD_DECIMAL_TYPE, XSD_DOUBLE_TYPE, XSD_FLOAT_TYPE};
 
+  // The keys for storing the base prefix (for relative and absolute IRIs) in
+  // the prefix map. The only thing that is important about these keys is that
+  // they are different from each other and from any valid prefix name.
+  static constexpr const char* baseForRelativeIriKey_ = "@";
+  static constexpr const char* baseForAbsoluteIriKey_ = "@@";
+
  protected:
   // Data members.
 
@@ -187,9 +193,23 @@ class TurtleParser : public RdfParserBase {
   // `TripleComponent` since it can hold any parsing result, not only objects.
   TripleComponent lastParseResult_;
 
-  // Maps prefixes to their expanded form, initialized with the empty base
-  // (i.e. the prefix ":" maps to the empty IRI).
-  ad_utility::HashMap<std::string, TripleComponent::Iri> prefixMap_{{{}, {}}};
+  // Map that maps prefix names to their IRI. For our tests, it is important
+  // that without any BASE declaration, the two base prefixes are mapped to the
+  // empty IRI.
+  static const inline ad_utility::HashMap<std::string, TripleComponent::Iri>
+      prefixMapDefault_{{baseForRelativeIriKey_, TripleComponent::Iri{}},
+                        {baseForAbsoluteIriKey_, TripleComponent::Iri{}}};
+  ad_utility::HashMap<std::string, TripleComponent::Iri> prefixMap_ =
+      prefixMapDefault_;
+
+  // Getters for the two base prefixes. Without BASE declaration, these will
+  // both return the empty IRI.
+  const TripleComponent::Iri& baseForRelativeIri() {
+    return prefixMap_.at(baseForRelativeIriKey_);
+  }
+  const TripleComponent::Iri& baseForAbsoluteIri() {
+    return prefixMap_.at(baseForAbsoluteIriKey_);
+  }
 
   // There are turtle constructs that reuse prefixes, subjects and predicates
   // so we have to save the last seen ones.
@@ -222,7 +242,7 @@ class TurtleParser : public RdfParserBase {
     activePredicate_ = TripleComponent::Iri::fromIriref("<>");
     activePrefix_.clear();
 
-    prefixMap_.clear();
+    prefixMap_ = prefixMapDefault_;
 
     tok_.reset(nullptr, 0);
     triples_.clear();
@@ -400,6 +420,8 @@ class TurtleParser : public RdfParserBase {
   FRIEND_TEST(RdfParserTest, predicateObjectList);
   FRIEND_TEST(RdfParserTest, objectList);
   FRIEND_TEST(RdfParserTest, object);
+  FRIEND_TEST(RdfParserTest, base);
+  FRIEND_TEST(RdfParserTest, sparqlBase);
   FRIEND_TEST(RdfParserTest, blankNode);
   FRIEND_TEST(RdfParserTest, blankNodePropertyList);
   FRIEND_TEST(RdfParserTest, numericLiteral);
@@ -516,8 +538,6 @@ class RdfStringParser : public Parser {
     this->tok_.reset(tmpToParse_.data(), tmpToParse_.size());
   }
 
-  void setPrefixMap(decltype(prefixMap_) m) { prefixMap_ = std::move(m); }
-
   const auto& getPrefixMap() const { return prefixMap_; }
 
   // __________________________________________________________
@@ -604,10 +624,10 @@ class RdfStreamParser : public Parser {
   // that's why we need the backupState() and resetStateAndRead() methods
   ParallelBuffer::BufferType byteVec_;
 
-  std::unique_ptr<ParallelBuffer> fileBuffer_;
+  size_t bufferSize_ = FILE_BUFFER_SIZE;
+  std::unique_ptr<ParallelBufferWithEndRegex> fileBuffer_;
   // this many characters will be buffered at once,
   // defaults to a global constant
-  size_t bufferSize_ = FILE_BUFFER_SIZE;
 
   // that many bytes were already parsed before dealing with the current batch
   // in member byteVec_