Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DO NOT MERGE: ngram profile and simplify #204

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion libursa/Database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ void Database::load_from_disk() {
load_dataset(dataset_fname);
}

profile = working_datasets[0]->generate_ngram_profile();

for (const auto &iterator : db_json["iterators"].items()) {
DatabaseName name(db_base, "iterator", iterator.key(),
iterator.value());
Expand Down Expand Up @@ -288,5 +290,5 @@ DatabaseSnapshot Database::snapshot() {
}

return DatabaseSnapshot(db_name, db_base, config_, iterators, cds,
taskspecs);
taskspecs, &profile);
}
1 change: 1 addition & 0 deletions libursa/Database.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Database {
std::map<std::string, OnDiskIterator> iterators;
std::vector<OnDiskDataset *> working_datasets;
std::vector<std::unique_ptr<OnDiskDataset>> loaded_datasets;
NgramProfile profile;
DatabaseConfig config_;

uint64_t last_task_id;
Expand Down
7 changes: 4 additions & 3 deletions libursa/DatabaseSnapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ DatabaseSnapshot::DatabaseSnapshot(
fs::path db_name, fs::path db_base, DatabaseConfig config,
std::map<std::string, OnDiskIterator> iterators,
std::vector<const OnDiskDataset *> datasets,
std::unordered_map<uint64_t, TaskSpec> tasks)
std::unordered_map<uint64_t, TaskSpec> tasks, NgramProfile *profile)
: db_name(std::move(db_name)),
db_base(std::move(db_base)),
iterators(std::move(iterators)),
config(std::move(config)),
datasets(std::move(datasets)),
tasks(std::move(tasks)) {}
tasks(std::move(tasks)),
profile(profile) {}

const OnDiskDataset *DatabaseSnapshot::find_dataset(
const std::string &name) const {
Expand Down Expand Up @@ -234,7 +235,7 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
if (!ds->has_all_taints(taints)) {
continue;
}
ds->execute(query, out, &counters);
ds->execute(query, out, &counters, *profile);
}
return counters;
}
Expand Down
4 changes: 3 additions & 1 deletion libursa/DatabaseSnapshot.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class DatabaseSnapshot {
std::map<std::string, OnDiskIterator> iterators;
DatabaseConfig config;
std::vector<const OnDiskDataset *> datasets;
NgramProfile *profile;
std::set<std::string> locked_datasets;
std::set<std::string> locked_iterators;
std::unordered_map<uint64_t, TaskSpec> tasks;
Expand Down Expand Up @@ -57,7 +58,8 @@ class DatabaseSnapshot {
DatabaseSnapshot(fs::path db_name, fs::path db_base, DatabaseConfig config,
std::map<std::string, OnDiskIterator> iterators,
std::vector<const OnDiskDataset *> datasets,
std::unordered_map<uint64_t, TaskSpec> tasks);
std::unordered_map<uint64_t, TaskSpec> tasks,
NgramProfile *profile);

DatabaseName derive_name(const DatabaseName &original,
const std::string &type) const {
Expand Down
15 changes: 13 additions & 2 deletions libursa/OnDiskDataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,15 @@ QueryResult OnDiskDataset::query(const Query &query,
}

void OnDiskDataset::execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const {
QueryCounters *counters, const NgramProfile &profile) const {
std::unordered_set<IndexType> types_to_query;
for (const auto &ndx : get_indexes()) {
types_to_query.emplace(ndx.index_type());
}
const Query plan = query.plan(types_to_query);
PrimitiveEvaluator evaluator = [&profile](PrimitiveQuery primitive) {
return profile.get_length(primitive);
};
const Query plan = query.plan(types_to_query, evaluator);

QueryResult result = this->query(plan, counters);
if (result.is_everything()) {
Expand Down Expand Up @@ -306,3 +309,11 @@ std::vector<const OnDiskDataset *> OnDiskDataset::get_compact_candidates(

return out;
}

// Builds an NgramProfile for this dataset: for every index in `indices`, the
// run offsets returned by read_run_offsets() are stored under that index's
// type.
NgramProfile OnDiskDataset::generate_ngram_profile() const {
    std::map<IndexType, std::vector<uint64_t>> offsets_by_type;
    for (const auto &ndx : indices) {
        auto run_offsets = ndx.read_run_offsets();
        offsets_by_type.emplace(ndx.index_type(), std::move(run_offsets));
    }
    return NgramProfile(std::move(offsets_by_type));
}
21 changes: 20 additions & 1 deletion libursa/OnDiskDataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,24 @@
#include "ResultWriter.h"
#include "Task.h"

class NgramProfile {
private:
std::map<IndexType, std::vector<uint64_t>> profiles;

public:
NgramProfile() :profiles() {}
NgramProfile(std::map<IndexType, std::vector<uint64_t>> &&profiles) :profiles(std::move(profiles)) {}
NgramProfile(const NgramProfile &other) = delete;
uint64_t get_length(PrimitiveQuery primitive) const {
for (auto &[key, profile] : profiles) {
if (key == primitive.itype) {
return profile.at(primitive.trigram + 1) - profile.at(primitive.trigram);
}
}
throw std::runtime_error("Unexpected ngram type in ngram profile");
}
};

class OnDiskDataset {
std::string name;
fs::path db_base;
Expand Down Expand Up @@ -43,7 +61,7 @@ class OnDiskDataset {
void toggle_taint(const std::string &taint);
bool has_all_taints(const std::set<std::string> &taints) const;
void execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const;
QueryCounters *counters, const NgramProfile &profile) const;
uint64_t get_file_count() const { return files_index->get_file_count(); }
void for_each_filename(std::function<void(const std::string &)> cb) const {
files_index->for_each_filename(cb);
Expand All @@ -58,6 +76,7 @@ class OnDiskDataset {
const std::set<std::string> &get_taints() const { return taints; }
static std::vector<const OnDiskDataset *> get_compact_candidates(
const std::vector<const OnDiskDataset *> &datasets);
NgramProfile generate_ngram_profile() const;

// Returns vectors of compatible datasets. Datasets are called compatible
// when they can be merged with each other - they have the same types and
Expand Down
1 change: 1 addition & 0 deletions libursa/OnDiskIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class OnDiskIndex {
const std::string &get_fname() const { return fname; }
const fs::path &get_fpath() const { return fpath; }
IndexType index_type() const { return ntype; }
// Size of the run stored for `trigram`: the difference between the two
// offsets returned by get_run_offsets().
uint64_t run_size_in_bytes(TriGram trigram) const {
    const auto [run_begin, run_end] = get_run_offsets(trigram);
    return run_end - run_begin;
}
QueryResult query(const QueryGraph &graph, QueryCounters *counters) const;
QueryResult query(TriGram trigram, QueryCounters *counters) const;
uint64_t real_size() const;
Expand Down
69 changes: 49 additions & 20 deletions libursa/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,9 @@ const QString &Query::as_value() const {

std::string Query::as_string_repr() const {
std::string out = "";
if (!query_plan.empty()) {
if (ngram != std::nullopt) {
// Query is already after planning stage. Show low-level representation.
for (const auto &token : query_plan) {
out += fmt::format("[{:x}]", token.trigram);
}
return out;
return fmt::format("{:x}", ngram->trigram);
}
// No query plan yet. Show stringlike representation.
for (const auto &token : value) {
Expand Down Expand Up @@ -273,33 +270,65 @@ std::vector<PrimitiveQuery> plan_qstring(
return std::move(plan);
}

Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const {
if (type != QueryType::PRIMITIVE) {
// Score used to order sub-queries. PRIMITIVE nodes are scored by calling
// `evaluator` on their ngram (assumes `ngram` is set for such nodes); every
// other node type gets the fixed score 2^32.
uint64_t Query::rarity(const PrimitiveEvaluator &evaluator) const {
    constexpr uint64_t compound_rarity = 0x100000000;
    if (type != QueryType::PRIMITIVE) {
        return compound_rarity;
    }
    return evaluator(*ngram);
}

Query Query::plan(const std::unordered_set<IndexType> &types_to_query, const PrimitiveEvaluator &evaluate) const {
if (type == QueryType::PRIMITIVE) {
if (ngram != std::nullopt) {
// Query already as simple as possible
return Query(*ngram);
}

std::vector<Query> plans;
for (const auto &query : queries) {
plans.emplace_back(query.plan(types_to_query));
for (const auto gram : plan_qstring(types_to_query, value)) {
plans.emplace_back(Query(gram));
}
return Query(QueryType::AND, std::move(plans)).plan(types_to_query, evaluate);
}

std::vector<Query> plans;
for (const auto &query : queries) {
plans.emplace_back(query.plan(types_to_query, evaluate));
}

// Special case `1 of ...` (OR) and `n of (1, 2, ... n)` (AND).
if (type == QueryType::MIN_OF) {
if (count == 1) {
return Query(QueryType::OR, std::move(plans)).plan(types_to_query, evaluate);
}
if (type == QueryType::MIN_OF) {
return Query(count, std::move(plans));
if (count == plans.size()) {
return Query(QueryType::AND, std::move(plans)).plan(types_to_query, evaluate);
}
return Query(type, std::move(plans));
return Query(count, std::move(plans));
}

return Query(plan_qstring(types_to_query, value));
// For all other types (AND and OR), flatten and simplify recursively
std::vector<Query> new_plans;
for (auto it = plans.begin(); it != plans.end(); it++) {
if (it->type == type) {
for (auto &subplan : it->queries) {
new_plans.emplace_back(std::move(subplan));
}
} else {
new_plans.emplace_back(std::move(*it));
}
}
std::sort(new_plans.begin(), new_plans.end(), [&evaluate](const auto &l, const auto &r) { return l.rarity(evaluate) < r.rarity(evaluate); });
return Query(type, std::move(new_plans));
}

QueryResult Query::run(const QueryPrimitive &primitive,
QueryCounters *counters) const {
// Case: primitive query - reduces to AND with tokens from query plan.
if (type == QueryType::PRIMITIVE) {
auto result = QueryResult::everything();
for (const auto &token : query_plan) {
result.do_and(primitive(token, counters), &counters->ands());
if (result.is_empty()) {
break;
}
}
return result;
return primitive(*ngram, counters);
}
// Case: and. Short circuits when result is already empty.
if (type == QueryType::AND) {
Expand Down
20 changes: 12 additions & 8 deletions libursa/Query.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ class PrimitiveQuery {
PrimitiveQuery(IndexType itype, TriGram trigram)
: itype(itype), trigram(trigram) {}

const IndexType itype;
const TriGram trigram;
IndexType itype;
TriGram trigram;
};

using QueryPrimitive =
std::function<QueryResult(PrimitiveQuery, QueryCounters *counter)>;

// Callback that scores a single primitive (ngram) query; Query::rarity()
// forwards PRIMITIVE nodes to it.
// NOTE(review): the callback returns uint32_t while Query::rarity() and
// NgramProfile::get_length() work in uint64_t, so larger values are narrowed
// on the way through - confirm this is intended.
using PrimitiveEvaluator = std::function<uint32_t(PrimitiveQuery)>;

// Query represents the query as provided by the user.
// Query can contain subqueries (using AND/OR/MINOF) or be a literal query.
// There are actually two types of literal query objects - "plain" and
Expand All @@ -40,7 +42,7 @@ using QueryPrimitive =
class Query {
private:
Query(const Query &other)
: type(other.type), query_plan(), count(other.count) {
: type(other.type), ngram(other.ngram), count(other.count) {
queries.reserve(other.queries.size());
for (const auto &query : other.queries) {
queries.emplace_back(query.clone());
Expand All @@ -51,16 +53,17 @@ class Query {
}
}

explicit Query(std::vector<PrimitiveQuery> &&query_plan)
explicit Query(PrimitiveQuery ngram)
: type(QueryType::PRIMITIVE),
query_plan(std::move(query_plan)),
ngram(ngram),
value() {}

public:
explicit Query(QString &&qstr);
explicit Query(uint32_t count, std::vector<Query> &&queries);
explicit Query(const QueryType &type, std::vector<Query> &&queries);
Query(Query &&other) = default;
Query &operator=(Query &&other) = default;

const std::vector<Query> &as_queries() const;
const QString &as_value() const;
Expand All @@ -71,15 +74,16 @@ class Query {

QueryResult run(const QueryPrimitive &primitive,
QueryCounters *counters) const;
Query plan(const std::unordered_set<IndexType> &types_to_query) const;
Query plan(const std::unordered_set<IndexType> &types_to_query, const PrimitiveEvaluator &evaluate) const;
uint64_t rarity(const PrimitiveEvaluator &evaluator) const;

Query clone() const { return Query(*this); }

private:
QueryType type;
// used for QueryType::PRIMITIVE
QString value; // before plan()
std::vector<PrimitiveQuery> query_plan; // after plan()
QString value; // before plan()
std::optional<PrimitiveQuery> ngram; // after plan()
// used for QueryType::MIN_OF
uint32_t count;
// used for QueryType::AND/OR/MIN_OF
Expand Down