From 6799a37925f8d4eb00c5d1b0c0a75747f75fb948 Mon Sep 17 00:00:00 2001
From: Hannah Bast <bast@cs.uni-freiburg.de>
Date: Wed, 24 Jan 2024 06:09:27 +0100
Subject: [PATCH] Have the `.tmp` files in the index directory

Reason: The merge was very SLOW when these were in the vocabulary
directory, which for our UniProt index builds is on HDD (because the
external vocabulary is so large). I first tried to only have the
`.tmp.partial-vocabulary.words` files in the index directory, but that
was still slow. Now also the `.tmp.partial-vocabulary.ids` files are in
the index directory.

Explanations concerning SLOW: The merging of the first few 100M triples
is fast (30 seconds per 100M triples). Then it becomes slow and then
very slow (half an hour from 700M triples to 800M triples). Not only is
the merge slow, but other activity on the machine (like writing
something in an editor with autosave on) also becomes very sluggish,
which is a clear sign that the random accesses to HDD are the problem.

NOTE: With the partial solution, where `.tmp.partial-vocabulary.words`
are on SSD and `.tmp.partial-vocabulary.ids` are on HDD, it is not as
bad. There was a very significant slow-down from 700M to 1100M triples,
but after that merging was fast again (though not as fast as in the
beginning). At the time of this writing, I have only observed up to
1700M triples; stay tuned for more information.
---
 src/VocabularyMergerMain.cpp        |  2 +-
 src/index/ConstantsIndexBuilding.h  |  5 +++--
 src/index/IndexImpl.cpp             | 18 +++++++++---------
 src/index/VocabularyGenerator.h     |  3 ++-
 src/index/VocabularyGeneratorImpl.h | 13 +++++++------
 test/VocabularyGeneratorTest.cpp    |  9 +++++----
 6 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/VocabularyMergerMain.cpp b/src/VocabularyMergerMain.cpp
index 279b266827..03301da8e8 100644
--- a/src/VocabularyMergerMain.cpp
+++ b/src/VocabularyMergerMain.cpp
@@ -24,6 +24,6 @@ int main(int argc, char** argv) {
   auto internalVocabularyAction = [&file](const auto& word) {
     file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
   };
-  m.mergeVocabulary(basename, numFiles, TripleComponentComparator(),
+  m.mergeVocabulary(basename, basename, numFiles, TripleComponentComparator(),
                     internalVocabularyAction, 4_GB);
 }
diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h
index 604b5ada77..87a0cca2ba 100644
--- a/src/index/ConstantsIndexBuilding.h
+++ b/src/index/ConstantsIndexBuilding.h
@@ -50,8 +50,9 @@ static const size_t BZIP2_MAX_TOTAL_BUFFER_SIZE = 1 << 30;
 static const size_t THRESHOLD_RELATION_CREATION = 2 << 20;
 
 // ________________________________________________________________
-static const std::string PARTIAL_VOCAB_FILE_NAME = ".tmp.partial-vocabulary.";
-static const std::string PARTIAL_MMAP_IDS = ".tmp.partial-ids-mmap.";
+static const std::string PARTIAL_VOCAB_FILE_NAME =
+    ".tmp.partial-vocabulary.words.";
+static const std::string PARTIAL_MMAP_IDS = ".tmp.partial-vocabulary.ids.";
 
 // ________________________________________________________________
 static const std::string TMP_BASENAME_COMPRESSION =
diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp
index 713279726d..6578936803 100644
--- a/src/index/IndexImpl.cpp
+++ b/src/index/IndexImpl.cpp
@@ -476,8 +476,8 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
         };
     m._noIdMapsAndIgnoreExternalVocab = true;
     auto mergeResult = m.mergeVocabulary(
-        onDiskBaseVocabulary_ + TMP_BASENAME_COMPRESSION, numFiles,
-        std::less<>(), internalVocabularyActionCompression,
+        onDiskBaseIndex_, onDiskBaseVocabulary_ + TMP_BASENAME_COMPRESSION,
+        numFiles, std::less<>(), internalVocabularyActionCompression,
         memoryLimitIndexBuilding());
     sizeInternalVocabulary = mergeResult.numWordsTotal_;
     LOG(INFO) << "Number of words in internal vocabulary: "
@@ -508,8 +508,8 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
     auto internalVocabularyAction = [&wordWriter](const auto& word) {
       wordWriter.push(word.data(), word.size());
     };
-    return v.mergeVocabulary(onDiskBaseVocabulary_, numFiles, sortPred,
-                             internalVocabularyAction,
+    return v.mergeVocabulary(onDiskBaseIndex_, onDiskBaseVocabulary_, numFiles,
+                             sortPred, internalVocabularyAction,
                              memoryLimitIndexBuilding());
   }();
   LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl;
@@ -526,9 +526,9 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
   LOG(INFO) << "Removing temporary files ..." << std::endl;
   for (size_t i = 0; i < numFiles; ++i) {
     deleteTemporaryFile(
-        absl::StrCat(onDiskBaseVocabulary_, PARTIAL_VOCAB_FILE_NAME, i));
+        absl::StrCat(onDiskBaseIndex_, PARTIAL_VOCAB_FILE_NAME, i));
     if (vocabPrefixCompressed_) {
-      deleteTemporaryFile(absl::StrCat(onDiskBaseVocabulary_,
+      deleteTemporaryFile(absl::StrCat(onDiskBaseIndex_,
                                        TMP_BASENAME_COMPRESSION,
                                        PARTIAL_VOCAB_FILE_NAME, i));
     }
@@ -629,7 +629,7 @@ IndexImpl::convertPartialToGlobalIds(
       return std::nullopt;
     }
     std::string mmapFilename =
-        absl::StrCat(onDiskBaseVocabulary_, PARTIAL_MMAP_IDS, idx);
+        absl::StrCat(onDiskBaseIndex_, PARTIAL_MMAP_IDS, idx);
     auto map = IdMapFromPartialIdMapFile(mmapFilename);
     // Delete the temporary file in which we stored this map
     deleteTemporaryFile(mmapFilename);
@@ -1219,9 +1219,9 @@ std::future<void> IndexImpl::writeNextPartialVocabulary(
       << actualCurrentPartialSize << std::endl;
   std::future<void> resultFuture;
   string partialFilename =
-      absl::StrCat(onDiskBaseVocabulary_, PARTIAL_VOCAB_FILE_NAME, numFiles);
+      absl::StrCat(onDiskBaseIndex_, PARTIAL_VOCAB_FILE_NAME, numFiles);
   string partialCompressionFilename =
-      absl::StrCat(onDiskBaseVocabulary_, TMP_BASENAME_COMPRESSION,
+      absl::StrCat(onDiskBaseIndex_, TMP_BASENAME_COMPRESSION,
                    PARTIAL_VOCAB_FILE_NAME, numFiles);
 
   auto lambda = [localIds = std::move(localIds), globalWritePtr,
diff --git a/src/index/VocabularyGenerator.h b/src/index/VocabularyGenerator.h
index c8c8172b72..a4777b8a50 100644
--- a/src/index/VocabularyGenerator.h
+++ b/src/index/VocabularyGenerator.h
@@ -113,7 +113,8 @@ class VocabularyMerger {
   // This automatically resets the inner members after finishing, to leave the
   // external interface stateless
   template <typename Comp, typename InternalVocabularyAction>
-  VocabularyMetaData mergeVocabulary(const std::string& fileIdx,
+  VocabularyMetaData mergeVocabulary(const std::string& baseNameIndex,
+                                     const std::string& baseNameVocabulary,
                                      size_t numFiles, Comp comparator,
                                      InternalVocabularyAction& action,
                                      ad_utility::MemorySize memToUse);
diff --git a/src/index/VocabularyGeneratorImpl.h b/src/index/VocabularyGeneratorImpl.h
index dfc1c8c9dd..c2328a324b 100644
--- a/src/index/VocabularyGeneratorImpl.h
+++ b/src/index/VocabularyGeneratorImpl.h
@@ -30,8 +30,9 @@
 // ___________________________________________________________________
 template <typename Comparator, typename InternalVocabularyAction>
 VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary(
-    const std::string& baseNameExternalVocabulary, size_t numFiles,
-    Comparator comparator, InternalVocabularyAction& internalVocabularyAction,
+    const std::string& baseNameIndex, const std::string& baseNameVocabulary,
+    size_t numFiles, Comparator comparator,
+    InternalVocabularyAction& internalVocabularyAction,
     ad_utility::MemorySize memoryToUse) {
   // Return true iff p1 >= p2 according to the lexicographic order of the IRI
   // or literal. All internal IRIs or literals come before all external ones.
@@ -52,8 +53,8 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary(
   std::vector<cppcoro::generator<QueueWord>> generators;
 
   auto makeGenerator = [&](size_t fileIdx) -> cppcoro::generator<QueueWord> {
-    ad_utility::serialization::FileReadSerializer infile{absl::StrCat(
-        baseNameExternalVocabulary, PARTIAL_VOCAB_FILE_NAME, fileIdx)};
+    ad_utility::serialization::FileReadSerializer infile{
+        absl::StrCat(baseNameIndex, PARTIAL_VOCAB_FILE_NAME, fileIdx)};
     uint64_t numWords;
     infile >> numWords;
     TripleComponentWithIndex val;
@@ -64,7 +65,7 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary(
     }
   };
   if (!_noIdMapsAndIgnoreExternalVocab) {
-    outfileExternal_ = ad_utility::makeOfstream(baseNameExternalVocabulary +
+    outfileExternal_ = ad_utility::makeOfstream(baseNameVocabulary +
                                                 EXTERNAL_LITS_TEXT_FILE_NAME);
   }
 
@@ -74,7 +75,7 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary(
     generators.push_back(makeGenerator(i));
     if (!_noIdMapsAndIgnoreExternalVocab) {
       idVecs_.emplace_back(
-          0, baseNameExternalVocabulary + PARTIAL_MMAP_IDS + std::to_string(i));
+          0, baseNameIndex + PARTIAL_MMAP_IDS + std::to_string(i));
     }
   }
 
diff --git a/test/VocabularyGeneratorTest.cpp b/test/VocabularyGeneratorTest.cpp
index 3936475b52..af22757215 100644
--- a/test/VocabularyGeneratorTest.cpp
+++ b/test/VocabularyGeneratorTest.cpp
@@ -172,8 +172,9 @@ TEST_F(MergeVocabularyTest, mergeVocabulary) {
     auto internalVocabularyAction = [&file](const auto& word) {
       file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
     };
-    res = m.mergeVocabulary(_basePath, 2, TripleComponentComparator(),
-                            internalVocabularyAction, 1_GB);
+    res =
+        m.mergeVocabulary(_basePath, _basePath, 2, TripleComponentComparator(),
+                          internalVocabularyAction, 1_GB);
   }
 
   // No language tags in text file
@@ -222,7 +223,7 @@ TEST(VocabularyGenerator, ReadAndWritePartial) {
       auto internalVocabularyAction = [&file](const auto& word) {
         file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
       };
-      m.mergeVocabulary(basename, 1, v.getCaseComparator(),
+      m.mergeVocabulary(basename, basename, 1, v.getCaseComparator(),
                         internalVocabularyAction, 1_GB);
     }
     auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0");
@@ -274,7 +275,7 @@ TEST(VocabularyGenerator, ReadAndWritePartial) {
       auto internalVocabularyAction = [&file](const auto& word) {
         file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
       };
-      m.mergeVocabulary(basename, 1, v.getCaseComparator(),
+      m.mergeVocabulary(basename, basename, 1, v.getCaseComparator(),
                         internalVocabularyAction, 1_GB);
     }
     auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0");