diff --git a/simple_ir/Dic.cpp b/simple_ir/Dic.cpp index 791279f..8ee1302 100644 --- a/simple_ir/Dic.cpp +++ b/simple_ir/Dic.cpp @@ -18,7 +18,7 @@ bool compareLists(List list1, List list2) return (list1.getTerm() < list2.getTerm()); } -List* Dic::getListByTerm(string term) +shared_ptr Dic::getListByTerm(string term) { if(!sorted) { @@ -29,7 +29,7 @@ List* Dic::getListByTerm(string term) // Binary search auto results = equal_range(lists.begin(), lists.end(), searchFor, compareLists); if(results.first != lists.end()) - return &(*results.first); + return shared_ptr(&(*results.first)); return NULL; } @@ -40,6 +40,12 @@ void Dic::addList(string term) sorted = false; } +void Dic::addList(string term, List list) +{ + this->lists.push_back(list); + sorted = false; +} + void Dic::sortLists() { diff --git a/simple_ir/Dic.h b/simple_ir/Dic.h index 8334838..d4cb145 100644 --- a/simple_ir/Dic.h +++ b/simple_ir/Dic.h @@ -4,6 +4,7 @@ #include"Posting.h" #include #include +#include using namespace std; @@ -11,9 +12,12 @@ class Dic { private: vector lists; + bool sorted; public: Dic(); - List* getListByTerm(string term); + shared_ptr getListByTerm(string term); void addList(string term); + void addList(string term, List list); + void sortLists(); }; #endif diff --git a/simple_ir/IndexConstructor.cpp b/simple_ir/IndexConstructor.cpp new file mode 100644 index 0000000..22c21ca --- /dev/null +++ b/simple_ir/IndexConstructor.cpp @@ -0,0 +1,70 @@ +#include "IndexConstructor.h" +#include +#include "TokenReader.h" +#include +#include +#include + +using namespace std; +using namespace boost::filesystem; +IndexConstructor::IndexConstructor(string folderName, int fileLimit) +{ + // Currently we only deal with the Reuter files + if ( !exists( folderName ) ) + cout << "folder " << folderName << "doesn't exit!" << endl; + directory_iterator end_itr; + int i = 0; + for ( directory_iterator itr( folderName ); + itr != end_itr && i != fileLimit; + ++itr, ++i ) + { + if ( is_regular_file(itr->status()) ) + { + files.push_back(itr->path().string()); + } + } + cout << "Done getting all file names. " << files.size() << " files in total." << endl; +} + +int IndexConstructor::getFileId(string fileName) +{ + // Assume files are named as numbers + string idStr = boost::filesystem::path(fileName).stem().string(); + int id; + stringstream(idStr) >> id; + return id; +} + +shared_ptr IndexConstructor::constructIndex() +{ + unordered_map hash; + for(auto file = files.begin(); file != files.end(); file++) + { + int fileId = getFileId(*file); + cout << "Processing file " << fileId << endl; + vector tokens = TokenReader::readAndLowerTokensFromFile(*file); + for(auto token = tokens.begin(); token != tokens.end(); token++) + { + auto list = hash.find(*token); + if(list == hash.end()) + { + List newList(*token); + hash.emplace(*token, newList); + list = hash.find(*token); + } + list->second.addPosting(fileId); + } + } + cout << "Sorting terms" << endl; + shared_ptr dic(new Dic()); + for(auto itr = hash.begin(); itr != hash.end(); itr++) + { + dic->addList(itr->first, itr->second); + } + dic->sortLists(); + return dic; +} + +IndexConstructor::~IndexConstructor() +{ +} diff --git a/simple_ir/IndexConstructor.h b/simple_ir/IndexConstructor.h new file mode 100644 index 0000000..40d4571 --- /dev/null +++ b/simple_ir/IndexConstructor.h @@ -0,0 +1,18 @@ +#pragma once +#include +#include +#include "Dic.h" +#include +using namespace std; + +class IndexConstructor +{ +private: + vector files; + int getFileId(string fileName); +public: + IndexConstructor(string folderName, int fileLimit); + shared_ptr constructIndex(); + ~IndexConstructor(void); +}; + diff --git a/simple_ir/List.cpp b/simple_ir/List.cpp index 73e3fb3..343c3aa 100644 --- a/simple_ir/List.cpp +++ b/simple_ir/List.cpp @@ -24,8 +24,9 @@ int List::getLength() return length; } -void List::addPosting(string docId) +void List::addPosting(int docId) { + // Assume docId are added in increasing order // TODO: frequency currently set to 0 Posting* posting = new Posting(docId, 0); if(head == NULL) @@ -36,9 +37,14 @@ void List::addPosting(string docId) } else { - last->next = posting; - last = last->next; - length ++; + // Because docId are added in increasing order we only need to asure + // last docId in list doesn't equal to the one we are adding. + if(last->getDocId() != docId) + { + last->next = posting; + last = last->next; + length ++; + } } } diff --git a/simple_ir/List.h b/simple_ir/List.h index 2cd554d..e52147e 100644 --- a/simple_ir/List.h +++ b/simple_ir/List.h @@ -12,9 +12,9 @@ class List Posting* last; public: List(string term); - string getTerm(); + string getTerm() const; int getLength(); - void addPosting(string docId); + void addPosting(int docId); Posting* getPostings(); }; #endif diff --git a/simple_ir/Posting.cpp b/simple_ir/Posting.cpp index e149568..44401ff 100644 --- a/simple_ir/Posting.cpp +++ b/simple_ir/Posting.cpp @@ -3,7 +3,7 @@ #include #include"Posting.h" using namespace std; -Posting::Posting(string docId, int fq) +Posting::Posting(int docId, int fq) { this->docId = docId; this->fq = fq; @@ -15,7 +15,7 @@ int Posting::freq() return fq; } -string Posting::getDocId() +int Posting::getDocId() { return docId; } \ No newline at end of file diff --git a/simple_ir/Posting.h b/simple_ir/Posting.h index bcaf940..8976ddd 100644 --- a/simple_ir/Posting.h +++ b/simple_ir/Posting.h @@ -8,12 +8,12 @@ class Posting { private: int fq;//频率 - string docId; + int docId; public: - Posting(string docId, int fq); + Posting(int docId, int fq); Posting* next; - string getDocId(); + int getDocId(); int freq();//得到频率 }; #endif diff --git a/simple_ir/main.cpp b/simple_ir/main.cpp index 79966ab..904375d 100644 --- a/simple_ir/main.cpp +++ b/simple_ir/main.cpp @@ -2,15 +2,25 @@ #include "TokenReader.h" #include "Dic.h" #include "test.h" - +#include "IndexConstructor.h" using namespace std; int main(int argc, char** argv) { - testDic(); - + //testDic(); + IndexConstructor constructor("../Reuters", 100); + auto dic = constructor.constructIndex(); + string term("commercial"); + cout << "Search for term " << term << endl; + auto list = dic->getListByTerm(term); + Posting* posting = list->getPostings(); + while(posting != NULL) + { + cout << posting->getDocId() << endl; + posting = posting->next; + } //TokenReader::readAndLowerTokensFromFile("../Reuters/10.html"); int i; diff --git a/simple_ir/simple_ir.vcxproj b/simple_ir/simple_ir.vcxproj index 54a5fa2..242e4c5 100644 --- a/simple_ir/simple_ir.vcxproj +++ b/simple_ir/simple_ir.vcxproj @@ -40,7 +40,8 @@ $(ReferencePath) - E:\boost_1_55_0;$(IncludePath) + E:\local\boost_1_55_0;$(IncludePath) + E:\local\boost_1_55_0\lib32-msvc-11.0;$(LibraryPath) @@ -66,6 +67,7 @@ + @@ -74,6 +76,7 @@ + diff --git a/simple_ir/test.cpp b/simple_ir/test.cpp index 8be85a1..3168f03 100644 --- a/simple_ir/test.cpp +++ b/simple_ir/test.cpp @@ -2,13 +2,14 @@ #include "Dic.h" #include "test.h" + using namespace std; void testDic() { Dic* dic = new Dic(); dic->sortLists(); - List* list = dic->getListByTerm("a"); + auto list = dic->getListByTerm("a"); if(list != NULL) cout << "Error!" << endl; dic->addList("a"); @@ -43,17 +44,17 @@ void testList() cout << "Error!" << endl; if(list->getLength() != 0) cout << "Error!" << endl; - list->addPosting("1"); - list->addPosting("2"); - list->addPosting("3"); + list->addPosting(1); + list->addPosting(2); + list->addPosting(3); Posting *posting = list->getPostings(); - if(posting->getDocId() != "1") + if(posting->getDocId() != 1) cout << "Error!" << endl; posting = posting->next; - if(posting->getDocId() != "2") + if(posting->getDocId() != 2) cout << "Error!" << endl; posting = posting->next; - if(posting->getDocId() != "3") + if(posting->getDocId() != 3) cout << "Error!" << endl; posting = posting->next; if(posting != NULL)