From 6799a37925f8d4eb00c5d1b0c0a75747f75fb948 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Wed, 24 Jan 2024 06:09:27 +0100 Subject: [PATCH] Have the `.tmp` files in the index directory Reason: The merge was very SLOW when these were in the vocabulary directory, which for our UniProt index builds is on HDD (because the external vocabulary is so large). I first tried to only have the `.tmp.partial-vocabulary.words` files in the index directory, but that was still slow. Now also the `.tmp.partial-vocabulary.ids` files are in the index directory. Explanations concerning SLOW: The merging of the first few 100M triples is fast (30 seconds per 100M triples). Then it becomes slow and then very slow (half an hour from 700M triples to 800M triples). Not only is it slow, but doing other stuff on the machine (like writing something in an editor with autosave on) becomes very slow to respond to, which is a clear sign that the random accesses to HDD are the problem. NOTE: With the partial solution, where `.tmp.partial-vocabulary.words` are on SSD and `.tmp.partial-vocabulary.ids` are on HDD, it is not as bad. There was a very significant slow-down from 700M to 1100M triples, but after that merging was fast again (though not as fast as in the beginning). At the time of this writing, I only observed until 1700M, stay tuned for more information. 
--- src/VocabularyMergerMain.cpp | 2 +- src/index/ConstantsIndexBuilding.h | 5 +++-- src/index/IndexImpl.cpp | 18 +++++++++--------- src/index/VocabularyGenerator.h | 3 ++- src/index/VocabularyGeneratorImpl.h | 13 +++++++------ test/VocabularyGeneratorTest.cpp | 9 +++++---- 6 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/VocabularyMergerMain.cpp b/src/VocabularyMergerMain.cpp index 279b266827..03301da8e8 100644 --- a/src/VocabularyMergerMain.cpp +++ b/src/VocabularyMergerMain.cpp @@ -24,6 +24,6 @@ int main(int argc, char** argv) { auto internalVocabularyAction = [&file](const auto& word) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - m.mergeVocabulary(basename, numFiles, TripleComponentComparator(), + m.mergeVocabulary(basename, basename, numFiles, TripleComponentComparator(), internalVocabularyAction, 4_GB); } diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 604b5ada77..87a0cca2ba 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -50,8 +50,9 @@ static const size_t BZIP2_MAX_TOTAL_BUFFER_SIZE = 1 << 30; static const size_t THRESHOLD_RELATION_CREATION = 2 << 20; // ________________________________________________________________ -static const std::string PARTIAL_VOCAB_FILE_NAME = ".tmp.partial-vocabulary."; -static const std::string PARTIAL_MMAP_IDS = ".tmp.partial-ids-mmap."; +static const std::string PARTIAL_VOCAB_FILE_NAME = + ".tmp.partial-vocabulary.words."; +static const std::string PARTIAL_MMAP_IDS = ".tmp.partial-vocabulary.ids."; // ________________________________________________________________ static const std::string TMP_BASENAME_COMPRESSION = diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 713279726d..6578936803 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -476,8 +476,8 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( }; m._noIdMapsAndIgnoreExternalVocab = true; 
auto mergeResult = m.mergeVocabulary( - onDiskBaseVocabulary_ + TMP_BASENAME_COMPRESSION, numFiles, - std::less<>(), internalVocabularyActionCompression, + onDiskBaseIndex_, onDiskBaseVocabulary_ + TMP_BASENAME_COMPRESSION, + numFiles, std::less<>(), internalVocabularyActionCompression, memoryLimitIndexBuilding()); sizeInternalVocabulary = mergeResult.numWordsTotal_; LOG(INFO) << "Number of words in internal vocabulary: " @@ -508,8 +508,8 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto internalVocabularyAction = [&wordWriter](const auto& word) { wordWriter.push(word.data(), word.size()); }; - return v.mergeVocabulary(onDiskBaseVocabulary_, numFiles, sortPred, - internalVocabularyAction, + return v.mergeVocabulary(onDiskBaseIndex_, onDiskBaseVocabulary_, numFiles, + sortPred, internalVocabularyAction, memoryLimitIndexBuilding()); }(); LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; @@ -526,9 +526,9 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( LOG(INFO) << "Removing temporary files ..." 
<< std::endl; for (size_t i = 0; i < numFiles; ++i) { deleteTemporaryFile( - absl::StrCat(onDiskBaseVocabulary_, PARTIAL_VOCAB_FILE_NAME, i)); + absl::StrCat(onDiskBaseIndex_, PARTIAL_VOCAB_FILE_NAME, i)); if (vocabPrefixCompressed_) { - deleteTemporaryFile(absl::StrCat(onDiskBaseVocabulary_, + deleteTemporaryFile(absl::StrCat(onDiskBaseIndex_, TMP_BASENAME_COMPRESSION, PARTIAL_VOCAB_FILE_NAME, i)); } @@ -629,7 +629,7 @@ IndexImpl::convertPartialToGlobalIds( return std::nullopt; } std::string mmapFilename = - absl::StrCat(onDiskBaseVocabulary_, PARTIAL_MMAP_IDS, idx); + absl::StrCat(onDiskBaseIndex_, PARTIAL_MMAP_IDS, idx); auto map = IdMapFromPartialIdMapFile(mmapFilename); // Delete the temporary file in which we stored this map deleteTemporaryFile(mmapFilename); @@ -1219,9 +1219,9 @@ std::future IndexImpl::writeNextPartialVocabulary( << actualCurrentPartialSize << std::endl; std::future resultFuture; string partialFilename = - absl::StrCat(onDiskBaseVocabulary_, PARTIAL_VOCAB_FILE_NAME, numFiles); + absl::StrCat(onDiskBaseIndex_, PARTIAL_VOCAB_FILE_NAME, numFiles); string partialCompressionFilename = - absl::StrCat(onDiskBaseVocabulary_, TMP_BASENAME_COMPRESSION, + absl::StrCat(onDiskBaseIndex_, TMP_BASENAME_COMPRESSION, PARTIAL_VOCAB_FILE_NAME, numFiles); auto lambda = [localIds = std::move(localIds), globalWritePtr, diff --git a/src/index/VocabularyGenerator.h b/src/index/VocabularyGenerator.h index c8c8172b72..a4777b8a50 100644 --- a/src/index/VocabularyGenerator.h +++ b/src/index/VocabularyGenerator.h @@ -113,7 +113,8 @@ class VocabularyMerger { // This automatically resets the inner members after finishing, to leave the // external interface stateless template - VocabularyMetaData mergeVocabulary(const std::string& fileIdx, + VocabularyMetaData mergeVocabulary(const std::string& baseNameIndex, + const std::string& baseNameVocabulary, size_t numFiles, Comp comparator, InternalVocabularyAction& action, ad_utility::MemorySize memToUse); diff --git 
a/src/index/VocabularyGeneratorImpl.h b/src/index/VocabularyGeneratorImpl.h index dfc1c8c9dd..c2328a324b 100644 --- a/src/index/VocabularyGeneratorImpl.h +++ b/src/index/VocabularyGeneratorImpl.h @@ -30,8 +30,9 @@ // ___________________________________________________________________ template VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( - const std::string& baseNameExternalVocabulary, size_t numFiles, - Comparator comparator, InternalVocabularyAction& internalVocabularyAction, + const std::string& baseNameIndex, const std::string& baseNameVocabulary, + size_t numFiles, Comparator comparator, + InternalVocabularyAction& internalVocabularyAction, ad_utility::MemorySize memoryToUse) { // Return true iff p1 >= p2 according to the lexicographic order of the IRI // or literal. All internal IRIs or literals come before all external ones. @@ -52,8 +53,8 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( std::vector> generators; auto makeGenerator = [&](size_t fileIdx) -> cppcoro::generator { - ad_utility::serialization::FileReadSerializer infile{absl::StrCat( - baseNameExternalVocabulary, PARTIAL_VOCAB_FILE_NAME, fileIdx)}; + ad_utility::serialization::FileReadSerializer infile{ + absl::StrCat(baseNameIndex, PARTIAL_VOCAB_FILE_NAME, fileIdx)}; uint64_t numWords; infile >> numWords; TripleComponentWithIndex val; @@ -64,7 +65,7 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( } }; if (!_noIdMapsAndIgnoreExternalVocab) { - outfileExternal_ = ad_utility::makeOfstream(baseNameExternalVocabulary + + outfileExternal_ = ad_utility::makeOfstream(baseNameVocabulary + EXTERNAL_LITS_TEXT_FILE_NAME); } @@ -74,7 +75,7 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( generators.push_back(makeGenerator(i)); if (!_noIdMapsAndIgnoreExternalVocab) { idVecs_.emplace_back( - 0, baseNameExternalVocabulary + PARTIAL_MMAP_IDS + std::to_string(i)); + 0, baseNameIndex + PARTIAL_MMAP_IDS + 
std::to_string(i)); } } diff --git a/test/VocabularyGeneratorTest.cpp b/test/VocabularyGeneratorTest.cpp index 3936475b52..af22757215 100644 --- a/test/VocabularyGeneratorTest.cpp +++ b/test/VocabularyGeneratorTest.cpp @@ -172,8 +172,9 @@ TEST_F(MergeVocabularyTest, mergeVocabulary) { auto internalVocabularyAction = [&file](const auto& word) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - res = m.mergeVocabulary(_basePath, 2, TripleComponentComparator(), - internalVocabularyAction, 1_GB); + res = + m.mergeVocabulary(_basePath, _basePath, 2, TripleComponentComparator(), + internalVocabularyAction, 1_GB); } // No language tags in text file @@ -222,7 +223,7 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { auto internalVocabularyAction = [&file](const auto& word) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - m.mergeVocabulary(basename, 1, v.getCaseComparator(), + m.mergeVocabulary(basename, basename, 1, v.getCaseComparator(), internalVocabularyAction, 1_GB); } auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0"); @@ -274,7 +275,7 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { auto internalVocabularyAction = [&file](const auto& word) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - m.mergeVocabulary(basename, 1, v.getCaseComparator(), + m.mergeVocabulary(basename, basename, 1, v.getCaseComparator(), internalVocabularyAction, 1_GB); } auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0");