Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter ZIMs by humanid & aliases #808

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions include/library.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ enum supportedListMode {
class Filter {
public: // types
using Tags = std::vector<std::string>;
using AliasNames = std::vector<std::string>;

private: // data
uint64_t activeFilters;
Expand All @@ -67,6 +68,7 @@ class Filter {
std::string _query;
bool _queryIsPartial;
std::string _name;
AliasNames _aliasNames;

public: // functions
Filter();
Expand Down Expand Up @@ -112,6 +114,7 @@ class Filter {
Filter& maxSize(size_t size);
Filter& query(std::string query, bool partial=true);
Filter& name(std::string name);
Filter& aliasNames(const AliasNames& aliasNames);

bool hasQuery() const;
const std::string& getQuery() const { return _query; }
Expand All @@ -135,6 +138,8 @@ class Filter {
const Tags& getAcceptTags() const { return _acceptTags; }
const Tags& getRejectTags() const { return _rejectTags; }

const AliasNames& getAliasNames() const { return _aliasNames; }

private: // functions
friend class Library;

Expand Down
1 change: 1 addition & 0 deletions include/name_mapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class HumanReadableNameMapper : public NameMapper {
virtual ~HumanReadableNameMapper() = default;
virtual std::string getNameForId(const std::string& id) const;
virtual std::string getIdForName(const std::string& name) const;
static std::string removeDateFromBookId(const std::string& bookId);
};

class UpdatableNameMapper : public NameMapper {
Expand Down
52 changes: 40 additions & 12 deletions src/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "tools/stringTools.h"
#include "tools/otherTools.h"
#include "tools/concurrent_cache.h"
#include "name_mapper.h"

#include <pugixml.hpp>
#include <algorithm>
Expand Down Expand Up @@ -461,6 +462,9 @@ void Library::updateBookDB(const Book& book)
indexer.index_text(normalizeText(book.getPublisher()), 1, "XP");
indexer.index_text(normalizeText(book.getName()), 1, "XN");
indexer.index_text(normalizeText(book.getCategory()), 1, "XC");
const auto bookName = book.getHumanReadableIdFromPath();
const auto aliasName = HumanReadableNameMapper::removeDateFromBookId(bookName);
indexer.index_text(normalizeText(aliasName), 1, "XF");

for ( const auto& tag : split(normalizeText(book.getTags()), ";") ) {
doc.add_boolean_term("XT" + tag);
Expand Down Expand Up @@ -505,6 +509,7 @@ Xapian::Query buildXapianQueryFromFilterQuery(const Filter& filter)
queryParser.add_prefix("publisher", "XP");
queryParser.add_prefix("creator", "A");
queryParser.add_prefix("tag", "XT");
queryParser.add_prefix("filename", "XF");
const auto partialQueryFlag = filter.queryIsPartial()
? Xapian::QueryParser::FLAG_PARTIAL
: 0;
Expand All @@ -521,6 +526,16 @@ Xapian::Query buildXapianQueryFromFilterQuery(const Filter& filter)
return queryParser.parse_query(normalizeText(filter.getQuery()), flags);
}

Xapian::Query makePhraseQuery(const std::string& query, const std::string& prefix)
{
Xapian::QueryParser queryParser;
queryParser.set_default_op(Xapian::Query::OP_OR);
queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
const auto flags = 0;
const auto q = queryParser.parse_query(normalizeText(query), flags, prefix);
return Xapian::Query(Xapian::Query::OP_PHRASE, q.get_terms_begin(), q.get_terms_end(), q.get_length());
}

Xapian::Query nameQuery(const std::string& name)
{
return Xapian::Query("XN" + normalizeText(name));
Expand All @@ -531,29 +546,31 @@ Xapian::Query categoryQuery(const std::string& category)
return Xapian::Query("XC" + normalizeText(category));
}

Xapian::Query aliasNamesQuery(const Filter::AliasNames& aliasNames)
{
Xapian::Query q = Xapian::Query(std::string());
std::vector<Xapian::Query> queryVec;
for (const auto& aliasName : aliasNames) {
queryVec.push_back(makePhraseQuery(aliasName, "XF"));
}
Xapian::Query combinedQuery(Xapian::Query::OP_OR, queryVec.begin(), queryVec.end());
q = Xapian::Query(Xapian::Query::OP_FILTER, q, combinedQuery);
return q;
}

Xapian::Query langQuery(const std::string& lang)
{
return Xapian::Query("L" + normalizeText(lang));
}

Xapian::Query publisherQuery(const std::string& publisher)
{
Xapian::QueryParser queryParser;
queryParser.set_default_op(Xapian::Query::OP_OR);
queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
const auto flags = 0;
const auto q = queryParser.parse_query(normalizeText(publisher), flags, "XP");
return Xapian::Query(Xapian::Query::OP_PHRASE, q.get_terms_begin(), q.get_terms_end(), q.get_length());
return makePhraseQuery(publisher, "XP");
}

Xapian::Query creatorQuery(const std::string& creator)
{
Xapian::QueryParser queryParser;
queryParser.set_default_op(Xapian::Query::OP_OR);
queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
const auto flags = 0;
const auto q = queryParser.parse_query(normalizeText(creator), flags, "A");
return Xapian::Query(Xapian::Query::OP_PHRASE, q.get_terms_begin(), q.get_terms_end(), q.get_length());
return makePhraseQuery(creator, "A");
}

Xapian::Query tagsQuery(const Filter::Tags& acceptTags, const Filter::Tags& rejectTags)
Expand Down Expand Up @@ -593,6 +610,9 @@ Xapian::Query buildXapianQuery(const Filter& filter)
const auto tq = tagsQuery(filter.getAcceptTags(), filter.getRejectTags());
q = Xapian::Query(Xapian::Query::OP_AND, q, tq);;
}
if ( !filter.getAliasNames().empty() ) {
q = Xapian::Query(Xapian::Query::OP_AND, q, aliasNamesQuery(filter.getAliasNames()));
}
return q;
}

Expand Down Expand Up @@ -742,6 +762,7 @@ enum filterTypes {
QUERY = FLAG(12),
NAME = FLAG(13),
CATEGORY = FLAG(14),
ALIASNAMES = FLAG(15),
};

Filter& Filter::local(bool accept)
Expand Down Expand Up @@ -844,6 +865,13 @@ Filter& Filter::name(std::string name)
return *this;
}

Filter& Filter::aliasNames(const AliasNames& aliasNames)
{
_aliasNames = aliasNames;
activeFilters |= ALIASNAMES;
return *this;
}

#define ACTIVE(X) (activeFilters & (X))
#define FILTER(TAG, TEST) if (ACTIVE(TAG) && !(TEST)) { return false; }
bool Filter::hasQuery() const
Expand Down
6 changes: 5 additions & 1 deletion src/name_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ HumanReadableNameMapper::HumanReadableNameMapper(kiwix::Library& library, bool w
if (!withAlias)
continue;

auto aliasName = replaceRegex(bookName, "", "_[[:digit:]]{4}-[[:digit:]]{2}$");
auto aliasName = removeDateFromBookId(bookName);
if (aliasName == bookName) {
continue;
}
Expand All @@ -51,6 +51,10 @@ HumanReadableNameMapper::HumanReadableNameMapper(kiwix::Library& library, bool w
}
}

std::string HumanReadableNameMapper::removeDateFromBookId(const std::string& bookId) {
return replaceRegex(bookId, "", "_[[:digit:]]{4}-[[:digit:]]{2}$");
}

Comment on lines +54 to +57
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please extract this function in a separate preparatory commit.

std::string HumanReadableNameMapper::getNameForId(const std::string& id) const {
return m_idToName.at(id);
}
Expand Down
3 changes: 3 additions & 0 deletions src/server/internalServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ Filter get_search_filter(const RequestContext& request, const std::string& prefi
try {
filter.rejectTags(kiwix::split(request.get_argument(prefix+"notag"), ";"));
} catch (...) {}
try {
filter.aliasNames(request.get_arguments(prefix + "book"));
} catch (...) {}
return filter;
}

Expand Down
18 changes: 18 additions & 0 deletions test/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,24 @@ TEST_F(LibraryTest, filterByTags)
);
}

TEST_F(LibraryTest, filterByAliasNames)
{
// filtering for one book
EXPECT_FILTER_RESULTS(kiwix::Filter().aliasNames({"zimfile"}),
"Ray Charles"
);

// filerting for more than one book
EXPECT_FILTER_RESULTS(kiwix::Filter().aliasNames({"zimfile", "example"}),
"An example ZIM archive",
"Ray Charles"
);

// filtering by alias name requires full text match
EXPECT_FILTER_RESULTS(kiwix::Filter().aliasNames({"wrong_name"}),
/* no results */
);
}
Comment on lines +503 to +520
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was rather expecting new tests in test/library_server.cpp, that in particular would demonstrate how the new filter is expected to interact with other filters?


TEST_F(LibraryTest, filterByQuery)
{
Expand Down