Skip to content

Commit

Permalink
Implement --debug-rules; Fix indexSetToRule() with mixed special and …
Browse files Browse the repository at this point in the history
…normal sets; Bump some indexes to 64 bit to combat collisions
  • Loading branch information
TinoDidriksen committed Aug 12, 2024
1 parent 16e34b3 commit 84face0
Show file tree
Hide file tree
Showing 11 changed files with 79 additions and 24 deletions.
1 change: 0 additions & 1 deletion src/Grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1126,7 +1126,6 @@ inline void trie_indexToRule(const trie_t& trie, Grammar& grammar, uint32_t r) {
void Grammar::indexSetToRule(uint32_t r, Set* s) {
if (s->type & (ST_SPECIAL | ST_TAG_UNIFY)) {
indexTagToRule(tag_any, r);
return;
}

trie_indexToRule(s->trie, *this, r);
Expand Down
37 changes: 33 additions & 4 deletions src/GrammarApplicator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ class GrammarApplicator {
uint32Vector sections;
uint32IntervalVector valid_rules;
uint32IntervalVector trace_rules;
uint32IntervalVector debug_rules;
uint32FlatHashMap variables;
uint32_t verbosity_level = 0;
uint32_t debug_level = 0;
Expand Down Expand Up @@ -279,10 +280,10 @@ class GrammarApplicator {
scoped_stack<unif_sets_t> ss_usets;
scoped_stack<uint32SortedVector> ss_u32sv;

uint32FlatHashSet index_regexp_yes;
uint32FlatHashSet index_regexp_no;
uint32FlatHashSet index_icase_yes;
uint32FlatHashSet index_icase_no;
uint64FlatHashSet index_regexp_yes;
uint64FlatHashSet index_regexp_no;
uint64FlatHashSet index_icase_yes;
uint64FlatHashSet index_icase_no;
std::vector<uint32FlatHashSet> index_readingSet_yes;
std::vector<uint32FlatHashSet> index_readingSet_no;
uint32FlatHashSet index_ruleCohort_no;
Expand Down Expand Up @@ -358,6 +359,34 @@ class GrammarApplicator {
std::deque<Reading> subs_any;
Reading* get_sub_reading(Reading* tr, int sub_reading);

/**
 * Writes a debug dump of all windows to ux_stderr for one rule application.
 *
 * The dump is framed by BEGIN/END marker lines that carry the rule's line
 * number, and lists the previous windows, the current window and the next
 * windows in that order.
 *
 * @param rule    Rule being reported; only rule.line is read here.
 * @param target  Chooses the " TARGET-MATCH" (true) or " TARGET-FAIL" (false) label.
 * @param cntx    Chooses the " CONTEXT-MATCH" (true) or " CONTEXT-FAIL" (false) label.
 */
void printDebugRule(const Rule& rule, bool target = true, bool cntx = true) {
	// One buffer shared by every call; it is emptied below before use.
	static std::stringstream out;

	// Exchange `trace` with a false value while the windows are printed.
	// NOTE(review): presumably swapper restores the old value on scope exit - confirm.
	bool trace_off = false;
	swapper<bool> trace_guard(true, trace, trace_off);

	// Drop any text and error flags left over from the previous call.
	out.clear();
	out.str("");

	const char* target_label = target ? " TARGET-MATCH" : " TARGET-FAIL";
	const char* context_label = cntx ? " CONTEXT-MATCH" : " CONTEXT-FAIL";
	out << "# ===== BEGIN RULE " << rule.line << target_label << context_label << " =====\n";

	// Whole context: everything before, at, and after the current window.
	out << "# PREVIOUS WINDOWS\n";
	for (auto win : gWindow->previous) {
		printSingleWindow(win, out, true);
	}

	out << "# CURRENT WINDOW\n";
	printSingleWindow(gWindow->current, out, true);

	out << "# NEXT WINDOWS\n";
	for (auto win : gWindow->next) {
		printSingleWindow(win, out, true);
	}

	out << "# ===== END RULE " << rule.line << " =====\n";

	const std::string text = out.str();
	u_fprintf(ux_stderr, "%s", text.c_str());
}

template<typename T>
void addProfilingExample(T& item) {
auto& buf = profiler->buf;
Expand Down
22 changes: 11 additions & 11 deletions src/GrammarApplicator_matchSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,11 @@ uint32_t GrammarApplicator::doesTagMatchRegexp(uint32_t test, const Tag& tag, bo
UErrorCode status = U_ZERO_ERROR;
int32_t gc = uregex_groupCount(tag.regexp, &status);
uint32_t match = 0;
uint32_t ih = hash_value(tag.hash, test);
if (!bypass_index && index_matches(index_regexp_no, ih)) {
auto ih = (UI64(tag.hash) << 32) | test;
if (!bypass_index && index_regexp_no.contains(ih)) {
match = 0;
}
else if (!bypass_index && gc == 0 && index_matches(index_regexp_yes, ih)) {
else if (!bypass_index && gc == 0 && index_regexp_yes.contains(ih)) {
match = test;
}
else {
Expand Down Expand Up @@ -130,11 +130,11 @@ uint32_t GrammarApplicator::doesTagMatchRegexp(uint32_t test, const Tag& tag, bo

uint32_t GrammarApplicator::doesTagMatchIcase(uint32_t test, const Tag& tag, bool bypass_index) {
uint32_t match = 0;
uint32_t ih = hash_value(tag.hash, test);
if (!bypass_index && index_matches(index_icase_no, ih)) {
auto ih = (UI64(tag.hash) << 32) | test;
if (!bypass_index && index_icase_no.contains(ih)) {
match = 0;
}
else if (!bypass_index && index_matches(index_icase_yes, ih)) {
else if (!bypass_index && index_icase_yes.contains(ih)) {
match = test;
}
else {
Expand All @@ -157,11 +157,11 @@ uint32_t GrammarApplicator::doesRegexpMatchLine(const Reading& reading, const Ta
UErrorCode status = U_ZERO_ERROR;
int32_t gc = uregex_groupCount(tag.regexp, &status);
uint32_t match = 0;
uint32_t ih = hash_value(reading.tags_string_hash, tag.hash);
if (!bypass_index && index_matches(index_regexp_no, ih)) {
auto ih = (UI64(reading.tags_string_hash) << 32) | tag.hash;
if (!bypass_index && index_regexp_no.contains(ih)) {
match = 0;
}
else if (!bypass_index && gc == 0 && index_matches(index_regexp_yes, ih)) {
else if (!bypass_index && gc == 0 && index_regexp_yes.contains(ih)) {
match = reading.tags_string_hash;
}
else {
Expand Down Expand Up @@ -671,10 +671,10 @@ bool GrammarApplicator::doesSetMatchReading(const Reading& reading, const uint32
// Only 30% of tests get past this.
// ToDo: This is not good enough...while numeric tags are special, their failures can be indexed.
if (!bypass_index && !unif_mode) {
if (index_readingSet_no[set].find(reading.hash) != index_readingSet_no[set].end()) {
if (index_readingSet_no[set].contains(reading.hash)) {
return false;
}
if (index_readingSet_yes[set].find(reading.hash) != index_readingSet_yes[set].end()) {
if (index_readingSet_yes[set].contains(reading.hash)) {
return true;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/GrammarApplicator_runContextualTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,7 @@ Cohort* GrammarApplicator::runDependencyTest(SingleWindow* sWindow, Cohort* curr

// ToDo: Now that dep_deep_seen is a composite, investigate all .clear() to see if they're needed
if (test->pos & POS_DEP_DEEP) {
if (index_matches(dep_deep_seen, std::make_pair(test->hash, current->global_number))) {
if (dep_deep_seen.contains(std::make_pair(test->hash, current->global_number))) {
return 0;
}
dep_deep_seen.insert(std::make_pair(test->hash, current->global_number));
Expand Down
18 changes: 17 additions & 1 deletion src/GrammarApplicator_runRules.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
// Check if on previous runs the rule did not match this cohort, and skip if that is the case.
// This cache is cleared if any rule causes any state change in the window.
uint32_t ih = hash_value(rule.number, cohort->global_number);
if (index_matches(index_ruleCohort_no, ih)) {
if (index_ruleCohort_no.contains(ih)) {
continue;
}
index_ruleCohort_no.insert(ih);
Expand Down Expand Up @@ -640,8 +640,15 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
for (auto r = cohort->readings[i]; r; r = r->next) {
r->active = true;
}
if (rule.line == 2746) {
cohort = cohort;
}
rule_target = cohort;
// Actually check if the reading is a valid target. First check if rule target matches...
if (rule.target && doesSetMatchReading(*reading, rule.target, (set.type & (ST_CHILD_UNIFY | ST_SPECIAL)) != 0)) {
if (rule.line == 2746) {
cohort = cohort;
}
bool regex_prop = true;
if (orz != context_stack.back().regexgrp_ct) {
did_test = false;
Expand Down Expand Up @@ -724,6 +731,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
addProfilingExample(r);
}
}
if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
printDebugRule(rule);
}

if (regex_prop && i && !regexgrps_c.empty()) {
for (auto z = i; z > 0; --z) {
Expand All @@ -738,6 +748,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
}
else {
context_stack.back().regexgrp_ct = orz;
if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
printDebugRule(rule, true, false);
}
}
++num_iff;
}
Expand All @@ -747,6 +760,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
Profiler::Key k{ ET_RULE, rule.number + 1 };
++profiler->entries[k].num_fail;
}
if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
printDebugRule(rule, false, false);
}
}
readings_plain.insert(std::make_pair(reading->hash_plain, reading));
for (auto r = cohort->readings[i]; r; r = r->next) {
Expand Down
5 changes: 5 additions & 0 deletions src/flat_unordered_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,10 @@ class flat_unordered_set {
return (find(t) != end());
}

/// Membership test: true when find(t) yields something other than end().
bool contains(T t) const {
	const auto pos = find(t);
	return pos != end();
}

const_iterator begin() const {
if (size_ == 0) {
return end();
Expand Down Expand Up @@ -326,6 +330,7 @@ class flat_unordered_set {
};

using uint32FlatHashSet = flat_unordered_set<uint32_t>;
using uint64FlatHashSet = flat_unordered_set<uint64_t>;
}

#endif
5 changes: 0 additions & 5 deletions src/inlines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,11 +475,6 @@ inline bool is_cg3b(const S& s) {
return (s[0] == 'C' && s[1] == 'G' && s[2] == '3' && s[3] == 'B');
}

/**
 * Reports whether `entry` can be found in `index`.
 *
 * Works with any container exposing find() and end(); the result is true
 * exactly when find(entry) does not return end().
 */
template<typename Cont, typename VT>
inline bool index_matches(const Cont& index, const VT& entry) {
	const auto hit = index.find(entry);
	const bool present = (hit != index.end());
	return present;
}

inline void insert_if_exists(boost::dynamic_bitset<>& cont, const boost::dynamic_bitset<>* other) {
if (other && !other->empty()) {
cont.resize(std::max(cont.size(), other->size()));
Expand Down
5 changes: 5 additions & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,11 @@ void GAppSetOpts(GrammarApplicator& applicator, UConverter* conv) {
}
}
}
if (options[DEBUG_RULES].doesOccur) {
if (!options[DEBUG_RULES].value.empty()) {
GAppSetOpts_ranged(options[DEBUG_RULES].value.c_str(), applicator.debug_rules, false);
}
}
if (options[VERBOSE].doesOccur) {
if (!options[VERBOSE].value.empty()) {
applicator.verbosity_level = std::stoul(options[VERBOSE].value);
Expand Down
2 changes: 2 additions & 0 deletions src/options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ enum OPTIONS {
NRULES,
NRULES_INV,
DODEBUG,
DEBUG_RULES,
VERBOSE,
QUIET,
VISLCGCOMPAT,
Expand Down Expand Up @@ -108,6 +109,7 @@ std::array<UOption,NUM_OPTIONS> options{
UOption{"nrules", 0, UOPT_REQUIRES_ARG, "a regex for which rule names to parse/run; defaults to all rules"},
UOption{"nrules-v", 0, UOPT_REQUIRES_ARG, "a regex for which rule names not to parse/run"},
UOption{"debug", 'd', UOPT_OPTIONAL_ARG, "enables debug output (very noisy)"},
UOption{"debug-rules", 0, UOPT_OPTIONAL_ARG, "number or ranges of rules to debug; defaults to all rules"},
UOption{"verbose", 'v', UOPT_OPTIONAL_ARG, "increases verbosity"},
UOption{"quiet", 0, UOPT_NO_ARG, "squelches warnings (same as -v 0)"},
UOption{"vislcg-compat", '2', UOPT_NO_ARG, "enables compatibility mode for older CG-2 and vislcg grammars"},
Expand Down
4 changes: 4 additions & 0 deletions src/sorted_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ class sorted_vector {
return (find(t) != end());
}

/// Membership test: true when find(t) yields something other than end().
bool contains(T t) const {
	const auto pos = find(t);
	return pos != end();
}

iterator begin() {
return elements.begin();
}
Expand Down
2 changes: 1 addition & 1 deletion src/version.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ constexpr auto CG3_COPYRIGHT_STRING = "Copyright (C) 2007-2024 GrammarSoft ApS.

constexpr uint32_t CG3_VERSION_MAJOR = 1;
constexpr uint32_t CG3_VERSION_MINOR = 4;
constexpr uint32_t CG3_VERSION_PATCH = 16;
constexpr uint32_t CG3_VERSION_PATCH = 17;
constexpr uint32_t CG3_REVISION = 13898;
constexpr uint32_t CG3_FEATURE_REV = 13898;
constexpr uint32_t CG3_TOO_OLD = 10373;
Expand Down

0 comments on commit 84face0

Please sign in to comment.