-
Notifications
You must be signed in to change notification settings - Fork 54
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Write vocabulary files to separate directory #1237
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -61,7 +61,8 @@ int main(int argc, char** argv) { | |||||
std::locale locWithNumberGrouping(loc, &facet); | ||||||
ad_utility::Log::imbue(locWithNumberGrouping); | ||||||
|
||||||
string baseName; | ||||||
string baseNameIndex; | ||||||
string baseNameVocabulary; | ||||||
string wordsfile; | ||||||
string docsfile; | ||||||
string textIndexName; | ||||||
|
@@ -86,8 +87,11 @@ int main(int argc, char** argv) { | |||||
boostOptions.add_options()(std::forward<Args>(args)...); | ||||||
}; | ||||||
add("help,h", "Produce this help message."); | ||||||
add("index-basename,i", po::value(&baseName)->required(), | ||||||
"The basename of the output files (required)."); | ||||||
add("index-basename,i", po::value(&baseNameIndex)->required(), | ||||||
"The basename of the index files (required)."); | ||||||
add("vocabulary-basename,v", po::value(&baseNameVocabulary), | ||||||
"The basename of the vocabulary files" | ||||||
"(default: same as basename of the index fles)."); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
add("kg-input-file,f", po::value(&inputFile), | ||||||
"The file with the knowledge graph data to be parsed from. If omitted, " | ||||||
"will read from stdin."); | ||||||
|
@@ -152,6 +156,12 @@ int main(int argc, char** argv) { | |||||
index.memoryLimitIndexBuilding() = stxxlMemory.value(); | ||||||
} | ||||||
|
||||||
// If no external vocabulary basename was specified, use the same as the | ||||||
// index basename. | ||||||
if (baseNameVocabulary.empty()) { | ||||||
baseNameVocabulary = baseNameIndex; | ||||||
} | ||||||
Comment on lines
+161
to
+163
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the |
||||||
|
||||||
// If no text index name was specified, take the part of the wordsfile after | ||||||
// the last slash. | ||||||
if (textIndexName.empty() && !wordsfile.empty()) { | ||||||
|
@@ -170,17 +180,17 @@ int main(int argc, char** argv) { | |||||
|
||||||
try { | ||||||
LOG(TRACE) << "Configuring STXXL..." << std::endl; | ||||||
size_t posOfLastSlash = baseName.rfind('/'); | ||||||
string location = baseName.substr(0, posOfLastSlash + 1); | ||||||
string tail = baseName.substr(posOfLastSlash + 1); | ||||||
size_t posOfLastSlash = baseNameIndex.rfind('/'); | ||||||
string location = baseNameIndex.substr(0, posOfLastSlash + 1); | ||||||
string tail = baseNameIndex.substr(posOfLastSlash + 1); | ||||||
writeStxxlConfigFile(location, tail); | ||||||
string stxxlFileName = getStxxlDiskFileName(location, tail); | ||||||
LOG(TRACE) << "done." << std::endl; | ||||||
|
||||||
index.setKbName(kbIndexName); | ||||||
index.setTextName(textIndexName); | ||||||
index.usePatterns() = !noPatterns; | ||||||
index.setOnDiskBase(baseName); | ||||||
index.setOnDiskBase(baseNameIndex, baseNameVocabulary); | ||||||
index.setKeepTempFiles(keepTemporaryFiles); | ||||||
index.setSettingsFile(settingsFile); | ||||||
index.setPrefixCompression(!noPrefixCompression); | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,7 +86,7 @@ void IndexImpl::addTextFromContextFile(const string& contextFile, | |
bool addWordsFromLiterals) { | ||
LOG(INFO) << std::endl; | ||
LOG(INFO) << "Adding text index ..." << std::endl; | ||
string indexFilename = onDiskBase_ + ".text.index"; | ||
string indexFilename = onDiskBaseIndex_ + ".text.index"; | ||
// Either read words from given file or consider each literal as text record | ||
// or both (but at least one of them, otherwise this function is not called). | ||
if (!contextFile.empty()) { | ||
|
@@ -107,14 +107,14 @@ void IndexImpl::addTextFromContextFile(const string& contextFile, | |
LOG(DEBUG) << "Reloading the RDF vocabulary ..." << std::endl; | ||
vocab_ = RdfsVocabulary{}; | ||
readConfiguration(); | ||
vocab_.readFromFile(onDiskBase_ + INTERNAL_VOCAB_SUFFIX, | ||
onDiskBase_ + EXTERNAL_VOCAB_SUFFIX); | ||
vocab_.readFromFile(onDiskBaseVocabulary_ + INTERNAL_VOCAB_SUFFIX, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you make the |
||
onDiskBaseVocabulary_ + EXTERNAL_VOCAB_SUFFIX); | ||
|
||
// Build the text vocabulary (first scan over the text records). | ||
LOG(INFO) << "Building text vocabulary ..." << std::endl; | ||
size_t nofLines = | ||
processWordsForVocabulary(contextFile, addWordsFromLiterals); | ||
textVocab_.writeToFile(onDiskBase_ + ".text.vocabulary"); | ||
textVocab_.writeToFile(onDiskBaseVocabulary_ + ".text.vocabulary"); | ||
|
||
// Build the half-inverted lists (second scan over the text records). | ||
LOG(INFO) << "Building the half-inverted index lists ..." << std::endl; | ||
|
@@ -134,7 +134,7 @@ void IndexImpl::addTextFromContextFile(const string& contextFile, | |
void IndexImpl::buildDocsDB(const string& docsFileName) const { | ||
LOG(INFO) << "Building DocsDB...\n"; | ||
std::ifstream docsFile{docsFileName}; | ||
std::ofstream ofs(onDiskBase_ + ".text.docsDB", std::ios_base::out); | ||
std::ofstream ofs(onDiskBaseIndex_ + ".text.docsDB", std::ios_base::out); | ||
// To avoid excessive use of RAM, | ||
// we write the offsets to an stxxl::vector first; | |
stxxl::vector<off_t> offsets; | ||
|
@@ -161,7 +161,7 @@ void IndexImpl::buildDocsDB(const string& docsFileName) const { | |
|
||
ofs.close(); | ||
// Now append the tmp file to the docsDB file. | ||
ad_utility::File out(onDiskBase_ + ".text.docsDB", "a"); | ||
ad_utility::File out(onDiskBaseIndex_ + ".text.docsDB", "a"); | ||
for (size_t i = 0; i < offsets.size(); ++i) { | ||
off_t cur = offsets[i]; | ||
out.write(&cur, sizeof(cur)); | ||
|
@@ -173,10 +173,10 @@ void IndexImpl::buildDocsDB(const string& docsFileName) const { | |
// _____________________________________________________________________________ | ||
void IndexImpl::addTextFromOnDiskIndex() { | ||
// Read the text vocabulary (into RAM). | ||
textVocab_.readFromFile(onDiskBase_ + ".text.vocabulary"); | ||
textVocab_.readFromFile(onDiskBaseVocabulary_ + ".text.vocabulary"); | ||
|
||
// Initialize the text index. | ||
std::string textIndexFileName = onDiskBase_ + ".text.index"; | ||
std::string textIndexFileName = onDiskBaseIndex_ + ".text.index"; | ||
LOG(INFO) << "Reading metadata from file " << textIndexFileName << " ..." | ||
<< std::endl; | ||
textIndexFile_.open(textIndexFileName.c_str(), "r"); | ||
|
@@ -194,11 +194,11 @@ void IndexImpl::addTextFromOnDiskIndex() { | |
// without this, but then there is no content to show when a text record | ||
// matches. This is perfectly fine when the text records come from IRIs or | ||
// literals from our RDF vocabulary. | ||
std::string docsDbFileName = onDiskBase_ + ".text.docsDB"; | ||
std::string docsDbFileName = onDiskBaseIndex_ + ".text.docsDB"; | ||
std::ifstream f(docsDbFileName.c_str()); | ||
if (f.good()) { | ||
f.close(); | ||
docsDB_.init(string(onDiskBase_ + ".text.docsDB")); | ||
docsDB_.init(string(onDiskBaseIndex_ + ".text.docsDB")); | ||
LOG(INFO) << "Registered text records: #records = " << docsDB_._size | ||
<< std::endl; | ||
} else { | ||
|
@@ -707,8 +707,8 @@ size_t IndexImpl::writeCodebook(const vector<T>& codebook, | |
|
||
// _____________________________________________________________________________ | ||
void IndexImpl::openTextFileHandle() { | ||
AD_CONTRACT_CHECK(!onDiskBase_.empty()); | ||
textIndexFile_.open(string(onDiskBase_ + ".text.index").c_str(), "r"); | ||
AD_CONTRACT_CHECK(!onDiskBaseIndex_.empty()); | ||
textIndexFile_.open(string(onDiskBaseIndex_ + ".text.index").c_str(), "r"); | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think one thing you could do (the outer interface that interacts with the control script is mostly your area)
is to set up a simple struct
ServerConfig
that is then passed to the constructor as well as the run()
function where they grab their respectively needed arguments. That makes it much easier to add additional arguments. (Probably we need a similar struct
IndexConfig
that then becomes part of the server config for exactly the same reason). Then it will become much easier to add additional arguments.
Are you interested in setting this up as a separate PR, or should I do this?