Skip to content

Commit

Permalink
Added a class for basic in-memory index construction
Browse files Browse the repository at this point in the history
  • Loading branch information
Wang Xiaojian committed Jun 17, 2014
1 parent 419bebc commit 370fc32
Show file tree
Hide file tree
Showing 11 changed files with 143 additions and 25 deletions.
10 changes: 8 additions & 2 deletions simple_ir/Dic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ bool compareLists(List list1, List list2)
return (list1.getTerm() < list2.getTerm());
}

List* Dic::getListByTerm(string term)
shared_ptr<List> Dic::getListByTerm(string term)
{
if(!sorted)
{
Expand All @@ -29,7 +29,7 @@ List* Dic::getListByTerm(string term)
// Binary search
auto results = equal_range(lists.begin(), lists.end(), searchFor, compareLists);
if(results.first != lists.end())
return &(*results.first);
return shared_ptr<List>(&(*results.first));
return NULL;
}

Expand All @@ -40,6 +40,12 @@ void Dic::addList(string term)
sorted = false;
}

// Appends a copy of an already-built List to the dictionary.
//
// term: not read by this overload; the list is stored exactly as passed in.
// list: the List to append.
//
// The dictionary is flagged as unsorted afterwards, because the new entry is
// simply placed at the end of `lists`.
void Dic::addList(string term, List list)
{
    lists.push_back(list);
    sorted = false;
}


void Dic::sortLists()
{
Expand Down
6 changes: 5 additions & 1 deletion simple_ir/Dic.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,20 @@
#include"Posting.h"
#include <string>
#include <vector>
#include <memory>

using namespace std;

// Dictionary of terms: owns a vector of List objects and hands them out by term.
class Dic
{
private:
// The Lists held by this dictionary.
vector<List> lists;
// Set to false by addList(); getListByTerm() checks it before searching.
bool sorted;
public:
Dic();
// NOTE(review): the two getListByTerm declarations below differ only in their
// return type, which C++ does not accept as overloads. They look like the
// before/after lines of a single change -- confirm that only the shared_ptr
// version is meant to remain.
List* getListByTerm(string term);
// Looks up the List for `term` with a binary search (equal_range) over `lists`;
// the implementation returns NULL on its fall-through path.
shared_ptr<List> getListByTerm(string term);
// Adds a list for `term` and marks the dictionary as unsorted.
void addList(string term);
// Appends the given `list` and marks the dictionary as unsorted; the
// implementation does not read `term`.
void addList(string term, List list);
// Presumably orders `lists` by term so the binary search is valid -- body not
// visible here, confirm against Dic.cpp.
void sortLists();
};
#endif
70 changes: 70 additions & 0 deletions simple_ir/IndexConstructor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#include "IndexConstructor.h"
#include <boost/filesystem.hpp>
#include "TokenReader.h"
#include <sstream>
#include <memory>
#include <unordered_map>

using namespace std;
using namespace boost::filesystem;
/**
 * Collects the paths of the regular files found directly inside a folder.
 *
 * @param folderName directory to scan (its sub-directories are not descended into).
 * @param fileLimit  maximum number of regular files to collect; a negative
 *                   value never matches the counter and so means "no limit".
 *
 * If folderName does not name an existing directory, a message is printed and
 * the file list is left empty.
 */
IndexConstructor::IndexConstructor(string folderName, int fileLimit)
{
    // Currently we only deal with the Reuters files
    if ( !exists( folderName ) || !is_directory( folderName ) )
    {
        cout << "folder " << folderName << " doesn't exist!" << endl;
        // directory_iterator throws when given a path that is not an existing
        // directory, so there is nothing more to do here.
        return;
    }

    const directory_iterator end_itr;
    // Count only the entries that are actually kept, so that sub-directories
    // and other non-regular entries do not use up the limit.
    int collected = 0;
    for ( directory_iterator itr( folderName );
          itr != end_itr && collected != fileLimit;
          ++itr )
    {
        if ( is_regular_file( itr->status() ) )
        {
            files.push_back( itr->path().string() );
            ++collected;
        }
    }
    cout << "Done getting all file names. " << files.size() << " files in total." << endl;
}

/**
 * Derives a numeric document id from a file path.
 *
 * @param fileName path of the file; its stem (file name without the last
 *                 extension) is parsed as a decimal integer.
 * @return the parsed id, or 0 when no number could be read from the stem.
 */
int IndexConstructor::getFileId(string fileName)
{
    // Assume files are named as numbers, e.g. "../Reuters/10.html" -> 10
    const string idStr = boost::filesystem::path(fileName).stem().string();
    // Start from 0: when the stem is empty the extraction below stores nothing,
    // and returning an uninitialised int would be undefined behaviour.
    int id = 0;
    stringstream(idStr) >> id;
    return id;
}

/**
 * Builds the in-memory index for every file collected by the constructor.
 *
 * Each file is tokenized with TokenReader::readAndLowerTokensFromFile and every
 * token gets the file's id added to that term's List. The Lists are then copied
 * into a new Dic, which is sorted before being returned.
 *
 * @return the populated and sorted dictionary.
 */
shared_ptr<Dic> IndexConstructor::constructIndex()
{
    // term -> List for that term
    unordered_map<string, List> listsByTerm;
    // NOTE(review): files are processed in the order the directory scan produced
    // them, which need not be increasing id order -- confirm whether
    // List::addPosting's "increasing docId" assumption matters to callers.
    for(const string& file : files)
    {
        const int fileId = getFileId(file);
        cout << "Processing file " << fileId << endl;
        const vector<string> tokens = TokenReader::readAndLowerTokensFromFile(file);
        for(const string& token : tokens)
        {
            auto entry = listsByTerm.find(token);
            if(entry == listsByTerm.end())
            {
                // emplace already returns the iterator to the new element, so
                // no second lookup is needed.
                entry = listsByTerm.emplace(token, List(token)).first;
            }
            entry->second.addPosting(fileId);
        }
    }
    cout << "Sorting terms" << endl;
    shared_ptr<Dic> dic = make_shared<Dic>();
    for(const auto& termAndList : listsByTerm)
    {
        dic->addList(termAndList.first, termAndList.second);
    }
    dic->sortLists();
    return dic;
}

// Nothing to release by hand: the class's only data member, `files`, is a
// vector that cleans up after itself.
IndexConstructor::~IndexConstructor() {}
18 changes: 18 additions & 0 deletions simple_ir/IndexConstructor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once
#include <string>
#include <vector>
#include "Dic.h"
#include <memory>
using namespace std;

/**
 * Builds an in-memory Dic out of the files found in a single folder.
 */
class IndexConstructor
{
public:
    /**
     * Records the paths of the regular files inside folderName.
     * @param folderName directory to scan.
     * @param fileLimit  upper bound on how many files are collected.
     */
    IndexConstructor(string folderName, int fileLimit);

    ~IndexConstructor();

    /** Tokenizes every recorded file and returns the resulting sorted Dic. */
    shared_ptr<Dic> constructIndex();

private:
    /** Parses the stem of fileName as the numeric id of that file. */
    int getFileId(string fileName);

    /** Paths of the files to index, filled in by the constructor. */
    vector<string> files;
};

14 changes: 10 additions & 4 deletions simple_ir/List.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ int List::getLength()
return length;
}

void List::addPosting(string docId)
void List::addPosting(int docId)
{
// Assume docId are added in increasing order
// TODO: frequency currently set to 0
Posting* posting = new Posting(docId, 0);
if(head == NULL)
Expand All @@ -36,9 +37,14 @@ void List::addPosting(string docId)
}
else
{
last->next = posting;
last = last->next;
length ++;
// Because docIds are added in increasing order, we only need to ensure that
// the last docId in the list is not equal to the one we are adding.
if(last->getDocId() != docId)
{
last->next = posting;
last = last->next;
length ++;
}
}
}

Expand Down
4 changes: 2 additions & 2 deletions simple_ir/List.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ class List
Posting* last;
public:
List(string term);
string getTerm();
string getTerm() const;
int getLength();
void addPosting(string docId);
void addPosting(int docId);
Posting* getPostings();
};
#endif
4 changes: 2 additions & 2 deletions simple_ir/Posting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include<string>
#include"Posting.h"
using namespace std;
Posting::Posting(string docId, int fq)
Posting::Posting(int docId, int fq)
{
this->docId = docId;
this->fq = fq;
Expand All @@ -15,7 +15,7 @@ int Posting::freq()
return fq;
}

string Posting::getDocId()
int Posting::getDocId()
{
return docId;
}
6 changes: 3 additions & 3 deletions simple_ir/Posting.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ class Posting
{
private:
int fq;//频率
string docId;
int docId;

public:
Posting(string docId, int fq);
Posting(int docId, int fq);
Posting* next;
string getDocId();
int getDocId();
int freq();//得到频率
};
#endif
16 changes: 13 additions & 3 deletions simple_ir/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,25 @@
#include "TokenReader.h"
#include "Dic.h"
#include "test.h"

#include "IndexConstructor.h"
using namespace std;



int main(int argc, char** argv)
{
testDic();

//testDic();
IndexConstructor constructor("../Reuters", 100);
auto dic = constructor.constructIndex();
string term("commercial");
cout << "Search for term " << term << endl;
auto list = dic->getListByTerm(term);
Posting* posting = list->getPostings();
while(posting != NULL)
{
cout << posting->getDocId() << endl;
posting = posting->next;
}
//TokenReader::readAndLowerTokensFromFile("../Reuters/10.html");

int i;
Expand Down
5 changes: 4 additions & 1 deletion simple_ir/simple_ir.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ReferencePath>$(ReferencePath)</ReferencePath>
<IncludePath>E:\boost_1_55_0;$(IncludePath)</IncludePath>
<IncludePath>E:\local\boost_1_55_0;$(IncludePath)</IncludePath>
<LibraryPath>E:\local\boost_1_55_0\lib32-msvc-11.0;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
Expand All @@ -66,6 +67,7 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="Dic.cpp" />
<ClCompile Include="IndexConstructor.cpp" />
<ClCompile Include="List.cpp" />
<ClCompile Include="main.cpp" />
<ClCompile Include="Posting.cpp" />
Expand All @@ -74,6 +76,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="Dic.h" />
<ClInclude Include="IndexConstructor.h" />
<ClInclude Include="List.h" />
<ClInclude Include="Posting.h" />
<ClInclude Include="test.h" />
Expand Down
15 changes: 8 additions & 7 deletions simple_ir/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
#include "Dic.h"
#include "test.h"


using namespace std;

void testDic()
{
Dic* dic = new Dic();
dic->sortLists();
List* list = dic->getListByTerm("a");
auto list = dic->getListByTerm("a");
if(list != NULL)
cout << "Error!" << endl;
dic->addList("a");
Expand Down Expand Up @@ -43,17 +44,17 @@ void testList()
cout << "Error!" << endl;
if(list->getLength() != 0)
cout << "Error!" << endl;
list->addPosting("1");
list->addPosting("2");
list->addPosting("3");
list->addPosting(1);
list->addPosting(2);
list->addPosting(3);
Posting *posting = list->getPostings();
if(posting->getDocId() != "1")
if(posting->getDocId() != 1)
cout << "Error!" << endl;
posting = posting->next;
if(posting->getDocId() != "2")
if(posting->getDocId() != 2)
cout << "Error!" << endl;
posting = posting->next;
if(posting->getDocId() != "3")
if(posting->getDocId() != 3)
cout << "Error!" << endl;
posting = posting->next;
if(posting != NULL)
Expand Down

0 comments on commit 370fc32

Please sign in to comment.