From 932e502891ff88e1c679fc3afa34bb424ce9fb73 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 31 Jul 2024 16:21:42 +0000 Subject: [PATCH 1/6] Output the right relations in PCleanSchemaHelper::make_hirm_schema --- cxx/pclean/schema_helper.cc | 87 ++++++++++++++++++-------------- cxx/pclean/schema_helper.hh | 25 +++++---- cxx/pclean/schema_helper_test.cc | 82 +++++++++++++++++++++++++----- 3 files changed, 129 insertions(+), 65 deletions(-) diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index 85ef91d..4af3071 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -12,26 +12,32 @@ void PCleanSchemaHelper::compute_class_name_cache() { } } -void PCleanSchemaHelper::compute_ancestors_cache() { +void PCleanSchemaHelper::compute_domains_cache() { for (const auto& c: schema.classes) { - if (!ancestors.contains(c.name)) { - ancestors[c.name] = compute_ancestors_for(c.name); + if (!domains.contains(c.name)) { + domains[c.name] = compute_domains_for(c.name); } } } -std::set PCleanSchemaHelper::compute_ancestors_for( +std::vector PCleanSchemaHelper::compute_domains_for( const std::string& name) { - std::set ancs; - std::set parents = get_parent_classes(name); - for (const std::string& p: parents) { - ancs.insert(p); - if (!ancestors.contains(p)) { - ancestors[p] = compute_ancestors_for(p); + std::vector ds; + ds.push_back(name); + PCleanClass c = get_class_by_name(name); + + for (const auto& v: c.vars) { + if (const ClassVar* cv = std::get_if(&(v.spec))) { + if (!domains.contains(cv->class_name)) { + domains[cv->class_name] = compute_domains_for(cv->class_name); + } + for (const std::string& s : domains[cv->class_name]) { + ds.push_back(v.name + ':' + s); + } } - ancs.insert(ancestors[p].cbegin(), ancestors[p].cend()); } - return ancs; + + return ds; } PCleanClass PCleanSchemaHelper::get_class_by_name(const std::string& name) { @@ -43,32 +49,33 @@ std::set PCleanSchemaHelper::get_parent_classes( std::set parents; PCleanClass c = get_class_by_name(name); for (const auto& v: c.vars) { - if (const ClassVar* cv = std::get_if(&(v.spec))) { parents.insert(cv->class_name); } } return parents; } -std::set PCleanSchemaHelper::get_ancestor_classes( - const std::string& name) { - return ancestors[name]; -} - -std::string get_base_relation_name( - const PCleanClass& c, const std::vector& field_path) { - assert(field_path.size() == 2); - const std::string& class_var = field_path[0]; - const std::string& var_name = field_path[1]; - std::string class_name = ""; - for (const auto& v : c.vars) { - if (v.name == class_var) { - class_name = std::get(v.spec).class_name; - break; +PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( + const PCleanClass& base_class, + std::vector::const_iterator path_iterator, + std::string* final_class_name) { + const std::string& s = *path_iterator; + for (const PCleanVariable& v : base_class.vars) { + if (v.name == s) { + if (std::has_alternative(v.spec)) { + *final_class_name = base_class.name; + return v; + } + const PCleanClass& next_class = get_class_by_name( + std::get(v.spec).class_name); + PCleanVariable sv = get_scalarvar_from_path( + next_class, ++path_iterator, final_class_name); + return sv; } } - assert(class_name != ""); - return class_name + ':' + var_name; + printf("Error: could not find name %s in class %s\n", + s.c_str(), base_class.name.c_str()); + assert(false); } T_schema PCleanSchemaHelper::make_hirm_schema() { @@ -77,16 +84,20 @@ T_schema PCleanSchemaHelper::make_hirm_schema() { for (const auto& v : c.vars) { std::string rel_name = c.name + ':' + v.name; if (const ScalarVar* dv = std::get_if(&(v.spec))) { - std::vector domains; - domains.push_back(c.name); - for (const std::string& sc : get_ancestor_classes(c.name)) { - domains.push_back(sc); - } - tschema[rel_name] = get_distribution_relation(*dv, domains); + tschema[rel_name] = get_distribution_relation(*dv, domains[c.name]); } - // TODO(thomaswc): If this class isn't the observation class, - // create additional noisy relations. } } + + const PCleanClass query_class = get_class_by_name(schema.query.base_class); + for (const auto& f : schema.query.fields) { + std::string final_class_name; + const PCleanVariable sv = get_scalarvar_from_path( + query_class, f.class_path.cbegin(), &final_class_name); + std::string base_relation = final_class_name + ':' + sv.name; + tschema[f.name] = get_emission_relation( + std::get(sv.spec), domains[query_class.name], base_relation); + } + return tschema; } diff --git a/cxx/pclean/schema_helper.hh b/cxx/pclean/schema_helper.hh index 8a13bb0..b7529cb 100644 --- a/cxx/pclean/schema_helper.hh +++ b/cxx/pclean/schema_helper.hh @@ -17,23 +17,22 @@ class PCleanSchemaHelper { PCleanClass get_class_by_name(const std::string& name); - // The parent classes of a class are those that are referred to by a - // ClassVar inside the class. - std::set get_parent_classes(const std::string& name); - // The ancestors of a class are the transitive closure of the parent - // relationship. - std::set get_ancestor_classes(const std::string& name); - // The source classes of a class are its ancestors without parents. - std::set get_source_classes(const std::string& name); - T_schema make_hirm_schema(); - private: + // The rest of these methods are conceptually private, but actually + // public for testing. + void compute_class_name_cache(); - void compute_ancestors_cache(); - std::set compute_ancestors_for(const std::string& name); + void compute_domains_cache(); + + std::vector compute_domains_for(const std::string& name); + + PCleanVariable get_scalarvar_from_path( + const PCleanClass& base_class, + std::vector::const_iterator path_iterator, + std::string* final_class_name); PCleanSchema schema; std::map class_name_to_index; - std::map> ancestors; + std::map> domains; }; diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc index 0211412..1999755 100644 --- a/cxx/pclean/schema_helper_test.cc +++ b/cxx/pclean/schema_helper_test.cc @@ -33,7 +33,7 @@ class Record observe physician.specialty as Specialty physician.school.name as School - physician.observed_degree as Degree + physician.degree as Degree location.city.name as City location.city.state as State from Record @@ -54,22 +54,43 @@ BOOST_AUTO_TEST_CASE(test_get_class_by_name) { BOOST_TEST(c.name == "Practice"); } -BOOST_AUTO_TEST_CASE(test_get_parent_classes) { +BOOST_AUTO_TEST_CASE(test_domains_cache) { PCleanSchemaHelper schema_helper(schema); - BOOST_TEST(schema_helper.get_parent_classes("School").empty()); - BOOST_TEST(schema_helper.get_parent_classes("City").empty()); - BOOST_TEST(schema_helper.get_parent_classes("Physician").size() == 1); - BOOST_TEST(schema_helper.get_parent_classes("Practice").size() == 1); - BOOST_TEST(schema_helper.get_parent_classes("Record").size() == 2); + + std::vector expected_domains = {"School"}; + BOOST_TEST(schema_helper.domains["School"] == expected_domains); + + expected_domains = {"Physician", "school:School"}; + BOOST_TEST(schema_helper.domains["Physician"] == expected_domains); + + expected_domains = {"City"}; + BOOST_TEST(schema_helper.domains["City"] == expected_domains); + + expected_domains = {"Practice", "city:City"}; + BOOST_TEST(schema_helper.domains["Practice"] == expected_domains); + + expected_domains = { + "Record", "physician:Physician", "physician:school:School", + "location:Practice", "location:city:City"}; + BOOST_TEST(schema_helper.domains["Record"] == expected_domains); } -BOOST_AUTO_TEST_CASE(test_get_ancestor_classes) { +BOOST_AUTO_TEST_CASE(test_domains_cache_two_paths_same_source) { + std::stringstream ss(R"""( +class City + name ~ string + +class Person + birth_city ~ City + home_city ~ City +)"""); + PCleanSchema schema; + assert(read_schema(ss, &schema)); PCleanSchemaHelper schema_helper(schema); - BOOST_TEST(schema_helper.get_ancestor_classes("School").empty()); - BOOST_TEST(schema_helper.get_ancestor_classes("City").empty()); - BOOST_TEST(schema_helper.get_ancestor_classes("Physician").size() == 1); - BOOST_TEST(schema_helper.get_ancestor_classes("Practice").size() == 1); - BOOST_TEST(schema_helper.get_ancestor_classes("Record").size() == 4); + + std::vector expected_domains = { + "Person", "birth_city:City", "home_city:City"}; + BOOST_TEST(schema_helper.domains["Person"] == expected_domains); } BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { @@ -92,7 +113,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("Physician:degree")); T_clean_relation cr3 = std::get(tschema["Physician:degree"]); BOOST_TEST((cr3.distribution_spec.distribution == DistributionEnum::stringcat)); - std::vector expected_domains2 = {"Physician", "School"}; + std::vector expected_domains2 = {"Physician", "schoool:School"}; BOOST_TEST(cr3.domains == expected_domains2); BOOST_TEST(tschema.contains("Physician:specialty")); @@ -103,6 +124,39 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(cr4.domains == expected_domains3); BOOST_TEST(tschema.contains("City:state")); + + BOOST_TEST(tschema.contains("Specialty")); + T_noisy_relation nr1 = std::get(tschema["Specialty"]); + BOOST_TEST(!nr1.is_observed); + BOOST_TEST((nr1.emission_spec.emission == EmissionEnum::bigram_string)); + expected_domains = { + "Record", "physician:Physician", "physician:school:School", + "location:Practice", "location:city:City"}; + BOOST_TEST(nr1.domains == expected_domains); + + BOOST_TEST(tschema.contains("School")); + T_noisy_relation nr2 = std::get(tschema["School"]); + BOOST_TEST(!nr2.is_observed); + BOOST_TEST((nr2.emission_spec.emission == EmissionEnum::bigram_string)); + BOOST_TEST(nr2.domains == expected_domains); + + BOOST_TEST(tschema.contains("Degree")); + T_noisy_relation nr3 = std::get(tschema["Degree"]); + BOOST_TEST(!nr3.is_observed); + BOOST_TEST((nr3.emission_spec.emission == EmissionEnum::bigram_string)); + BOOST_TEST(nr3.domains == expected_domains); + + BOOST_TEST(tschema.contains("City")); + T_noisy_relation nr4 = std::get(tschema["City"]); + BOOST_TEST(!nr4.is_observed); + BOOST_TEST((nr4.emission_spec.emission == EmissionEnum::bigram_string)); + BOOST_TEST(nr4.domains == expected_domains); + + BOOST_TEST(tschema.contains("State")); + T_noisy_relation nr5 = std::get(tschema["State"]); + BOOST_TEST(!nr5.is_observed); + BOOST_TEST((nr5.emission_spec.emission == EmissionEnum::bigram_string)); + BOOST_TEST(nr5.domains == expected_domains); } BOOST_AUTO_TEST_SUITE_END() From 4fe9b9a9fb35a35a8d1bdc1e6066b192f487f82c Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 31 Jul 2024 16:26:03 +0000 Subject: [PATCH 2/6] Fix build and test errors --- cxx/pclean/schema_helper.cc | 15 ++------------- cxx/pclean/schema_helper_test.cc | 2 +- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index 4af3071..03be888 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -3,7 +3,7 @@ PCleanSchemaHelper::PCleanSchemaHelper(const PCleanSchema& s): schema(s) { compute_class_name_cache(); - compute_ancestors_cache(); + compute_domains_cache(); } void PCleanSchemaHelper::compute_class_name_cache() { @@ -44,17 +44,6 @@ PCleanClass PCleanSchemaHelper::get_class_by_name(const std::string& name) { return schema.classes[class_name_to_index[name]]; } -std::set PCleanSchemaHelper::get_parent_classes( - const std::string& name) { - std::set parents; - PCleanClass c = get_class_by_name(name); - for (const auto& v: c.vars) { - parents.insert(cv->class_name); - } - } - return parents; -} - PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( const PCleanClass& base_class, std::vector::const_iterator path_iterator, @@ -62,7 +51,7 @@ PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( const std::string& s = *path_iterator; for (const PCleanVariable& v : base_class.vars) { if (v.name == s) { - if (std::has_alternative(v.spec)) { + if (std::holds_alternative(v.spec)) { *final_class_name = base_class.name; return v; } diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc index 1999755..f7d6eb5 100644 --- a/cxx/pclean/schema_helper_test.cc +++ b/cxx/pclean/schema_helper_test.cc @@ -113,7 +113,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("Physician:degree")); T_clean_relation cr3 = std::get(tschema["Physician:degree"]); BOOST_TEST((cr3.distribution_spec.distribution == DistributionEnum::stringcat)); - std::vector expected_domains2 = {"Physician", "schoool:School"}; + std::vector expected_domains2 = {"Physician", "school:School"}; BOOST_TEST(cr3.domains == expected_domains2); BOOST_TEST(tschema.contains("Physician:specialty")); From 993044ec34169b139f6124cc3c9ad8960167ef71 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 31 Jul 2024 19:14:26 +0000 Subject: [PATCH 3/6] Respond to reviewer comments --- cxx/pclean/io.cc | 6 +++--- cxx/pclean/io_test.cc | 2 +- cxx/pclean/schema.hh | 2 +- cxx/pclean/schema_helper.cc | 2 +- cxx/pclean/schema_helper_test.cc | 25 +++++++++++++++++++++++++ 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/cxx/pclean/io.cc b/cxx/pclean/io.cc index aeeb398..ddff934 100644 --- a/cxx/pclean/io.cc +++ b/cxx/pclean/io.cc @@ -133,11 +133,11 @@ bool read_query(std::istream& is, PCleanQuery* query) { printf("Expected exactly two tokens on query from line %s", line.c_str()); return false; } - if (!query->base_class.empty()) { + if (!query->record_class.empty()) { printf("Expected exactly one `from` clause in query.\n"); return false; } - query->base_class = tokens[1].val; + query->record_class = tokens[1].val; continue; } @@ -209,7 +209,7 @@ bool read_schema(std::istream& is, PCleanSchema* schema) { } if (tokens[0].val == "observe") { - if (!schema->query.base_class.empty()) { + if (!schema->query.record_class.empty()) { printf("Error reading schema line %s: only one query is allowed\n", line.c_str()); return false; diff --git a/cxx/pclean/io_test.cc b/cxx/pclean/io_test.cc index 3cc03be..45d30d1 100644 --- a/cxx/pclean/io_test.cc +++ b/cxx/pclean/io_test.cc @@ -41,7 +41,7 @@ observe )"""); PCleanSchema schema; BOOST_TEST(read_schema(ss, &schema)); - BOOST_TEST(schema.query.base_class == "Record"); + BOOST_TEST(schema.query.record_class == "Record"); BOOST_TEST(schema.query.fields.size() == 5); BOOST_TEST(schema.query.fields[0].name == "Specialty"); diff --git a/cxx/pclean/schema.hh b/cxx/pclean/schema.hh index 1186ed3..022738b 100644 --- a/cxx/pclean/schema.hh +++ b/cxx/pclean/schema.hh @@ -38,7 +38,7 @@ struct QueryField { }; struct PCleanQuery { - std::string base_class; + std::string record_class; std::vector fields; }; diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index 03be888..7b56b38 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -78,7 +78,7 @@ T_schema PCleanSchemaHelper::make_hirm_schema() { } } - const PCleanClass query_class = get_class_by_name(schema.query.base_class); + const PCleanClass query_class = get_class_by_name(schema.query.record_class); for (const auto& f : schema.query.fields) { std::string final_class_name; const PCleanVariable sv = get_scalarvar_from_path( diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc index f7d6eb5..65fc1a9 100644 --- a/cxx/pclean/schema_helper_test.cc +++ b/cxx/pclean/schema_helper_test.cc @@ -93,6 +93,31 @@ class Person BOOST_TEST(schema_helper.domains["Person"] == expected_domains); } +BOOST_AUTO_TEST_CASE(test_domains_cache_diamond) { + std::stringstream ss(R"""( +class City + name ~ string + +class School + location ~ City + +class Practice + location ~ City + +class Physician + practice ~ Practice + school ~ School +)"""); + PCleanSchema schema; + assert(read_schema(ss, &schema)); + PCleanSchemaHelper schema_helper(schema); + + std::vector expected_domains = { + "Physician", "practice:Practice", "practice:location:City", + "school:School", "school:location:City"}; + BOOST_TEST(schema_helper.domains["Physician"] == expected_domains); +} + BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { PCleanSchemaHelper schema_helper(schema); T_schema tschema = schema_helper.make_hirm_schema(); From 5cea10e97189f2dbf1042a787d4e58fbb76abcd1 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 31 Jul 2024 20:33:44 +0000 Subject: [PATCH 4/6] Add annotated_domains and use it to reorder the domains of noisy_relations --- cxx/pclean/schema_helper.cc | 52 ++++++++++++++++++++++++++------ cxx/pclean/schema_helper.hh | 6 ++-- cxx/pclean/schema_helper_test.cc | 36 ++++++++++++++++++---- 3 files changed, 76 insertions(+), 18 deletions(-) diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index 7b56b38..cd45f45 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -15,29 +15,34 @@ void PCleanSchemaHelper::compute_class_name_cache() { void PCleanSchemaHelper::compute_domains_cache() { for (const auto& c: schema.classes) { if (!domains.contains(c.name)) { - domains[c.name] = compute_domains_for(c.name); + compute_domains_for(c.name); } } } -std::vector PCleanSchemaHelper::compute_domains_for( - const std::string& name) { +void PCleanSchemaHelper::compute_domains_for(const std::string& name) { std::vector ds; + std::vector annotated_ds; ds.push_back(name); + annotated_ds.push_back(name); PCleanClass c = get_class_by_name(name); for (const auto& v: c.vars) { if (const ClassVar* cv = std::get_if(&(v.spec))) { if (!domains.contains(cv->class_name)) { - domains[cv->class_name] = compute_domains_for(cv->class_name); + compute_domains_for(cv->class_name); } for (const std::string& s : domains[cv->class_name]) { - ds.push_back(v.name + ':' + s); + ds.push_back(s); + } + for (const std::string& s : annotated_domains[cv->class_name]) { + annotated_ds.push_back(v.name + ':' + s); } } } - return ds; + domains[name] = ds; + annotated_domains[name] = annotated_ds; } PCleanClass PCleanSchemaHelper::get_class_by_name(const std::string& name) { @@ -47,7 +52,8 @@ PCleanClass PCleanSchemaHelper::get_class_by_name(const std::string& name) { PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( const PCleanClass& base_class, std::vector::const_iterator path_iterator, - std::string* final_class_name) { + std::string* final_class_name, + std::string* path_prefix) { const std::string& s = *path_iterator; for (const PCleanVariable& v : base_class.vars) { if (v.name == s) { @@ -55,10 +61,11 @@ PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( *final_class_name = base_class.name; return v; } + path_prefix->append(v.name + ":"); const PCleanClass& next_class = get_class_by_name( std::get(v.spec).class_name); PCleanVariable sv = get_scalarvar_from_path( - next_class, ++path_iterator, final_class_name); + next_class, ++path_iterator, final_class_name, path_prefix); return sv; } } @@ -67,6 +74,26 @@ PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( assert(false); } +// Returns original_domains, but with the elements corresponding to +// annotated_ds elements that start with prefix moved to the front. +std::vector reorder_domains( + const std::vector& original_domains, + const std::vector& annotated_ds, + const std::string& prefix) { + std::vector output_domains; + for (size_t i = 0; i < original_domains.size(); ++i) { + if (annotated_ds[i].starts_with(prefix)) { + output_domains.push_back(original_domains[i]); + } + } + for (size_t i = 0; i < original_domains.size(); ++i) { + if (!annotated_ds[i].starts_with(prefix)) { + output_domains.push_back(original_domains[i]); + } + } + return output_domains; +} + T_schema PCleanSchemaHelper::make_hirm_schema() { T_schema tschema; for (const auto& c : schema.classes) { @@ -81,11 +108,16 @@ T_schema PCleanSchemaHelper::make_hirm_schema() { const PCleanClass query_class = get_class_by_name(schema.query.record_class); for (const auto& f : schema.query.fields) { std::string final_class_name; + std::string path_prefix; const PCleanVariable sv = get_scalarvar_from_path( - query_class, f.class_path.cbegin(), &final_class_name); + query_class, f.class_path.cbegin(), &final_class_name, &path_prefix); std::string base_relation = final_class_name + ':' + sv.name; + std::vector reordered_domains = reorder_domains( + domains[query_class.name], + annotated_domains[query_class.name], + path_prefix); tschema[f.name] = get_emission_relation( - std::get(sv.spec), domains[query_class.name], base_relation); + std::get(sv.spec), reordered_domains, base_relation); } return tschema; diff --git a/cxx/pclean/schema_helper.hh b/cxx/pclean/schema_helper.hh index b7529cb..11a750d 100644 --- a/cxx/pclean/schema_helper.hh +++ b/cxx/pclean/schema_helper.hh @@ -25,14 +25,16 @@ class PCleanSchemaHelper { void compute_class_name_cache(); void compute_domains_cache(); - std::vector compute_domains_for(const std::string& name); + void compute_domains_for(const std::string& name); PCleanVariable get_scalarvar_from_path( const PCleanClass& base_class, std::vector::const_iterator path_iterator, - std::string* final_class_name); + std::string* final_class_name, + std::string* path_prefix); PCleanSchema schema; std::map class_name_to_index; std::map> domains; + std::map> annotated_domains; }; diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc index 65fc1a9..747d060 100644 --- a/cxx/pclean/schema_helper_test.cc +++ b/cxx/pclean/schema_helper_test.cc @@ -58,21 +58,32 @@ BOOST_AUTO_TEST_CASE(test_domains_cache) { PCleanSchemaHelper schema_helper(schema); std::vector expected_domains = {"School"}; + std::vector expected_annotated_domains = {"School"}; BOOST_TEST(schema_helper.domains["School"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["School"] == expected_annotated_domains); - expected_domains = {"Physician", "school:School"}; + expected_domains = {"Physician", "School"}; + expected_annotated_domains = {"Physician", "school:School"}; BOOST_TEST(schema_helper.domains["Physician"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Physician"] == expected_annotated_domains); expected_domains = {"City"}; + expected_annotated_domains = {"City"}; BOOST_TEST(schema_helper.domains["City"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["City"] == expected_annotated_domains); - expected_domains = {"Practice", "city:City"}; + expected_domains = {"Practice", "City"}; + expected_annotated_domains = {"Practice", "city:City"}; BOOST_TEST(schema_helper.domains["Practice"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Practice"] == expected_annotated_domains); expected_domains = { + "Record", "Physician", "School", "Practice", "City"}; + expected_annotated_domains = { "Record", "physician:Physician", "physician:school:School", "location:Practice", "location:city:City"}; BOOST_TEST(schema_helper.domains["Record"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Record"] == expected_annotated_domains); } BOOST_AUTO_TEST_CASE(test_domains_cache_two_paths_same_source) { @@ -89,8 +100,11 @@ class Person PCleanSchemaHelper schema_helper(schema); std::vector expected_domains = { + "Person", "City", "City"}; + std::vector expected_annotated_domains = { "Person", "birth_city:City", "home_city:City"}; BOOST_TEST(schema_helper.domains["Person"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Person"] == expected_annotated_domains); } BOOST_AUTO_TEST_CASE(test_domains_cache_diamond) { @@ -113,9 +127,12 @@ class Physician PCleanSchemaHelper schema_helper(schema); std::vector expected_domains = { + "Physician", "Practice", "City", "School", "City"}; + std::vector expected_annotated_domains = { "Physician", "practice:Practice", "practice:location:City", "school:School", "school:location:City"}; BOOST_TEST(schema_helper.domains["Physician"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Physician"] == expected_annotated_domains); } BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { @@ -138,7 +155,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("Physician:degree")); T_clean_relation cr3 = std::get(tschema["Physician:degree"]); BOOST_TEST((cr3.distribution_spec.distribution == DistributionEnum::stringcat)); - std::vector expected_domains2 = {"Physician", "school:School"}; + std::vector expected_domains2 = {"Physician", "School"}; BOOST_TEST(cr3.domains == expected_domains2); BOOST_TEST(tschema.contains("Physician:specialty")); @@ -154,33 +171,40 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { T_noisy_relation nr1 = std::get(tschema["Specialty"]); BOOST_TEST(!nr1.is_observed); BOOST_TEST((nr1.emission_spec.emission == EmissionEnum::bigram_string)); - expected_domains = { - "Record", "physician:Physician", "physician:school:School", - "location:Practice", "location:city:City"}; + // "Physician", "School" moved to the front of the list. + expected_domains = {"Physician", "School", "Record", "Practice", "City"}; BOOST_TEST(nr1.domains == expected_domains); BOOST_TEST(tschema.contains("School")); T_noisy_relation nr2 = std::get(tschema["School"]); BOOST_TEST(!nr2.is_observed); BOOST_TEST((nr2.emission_spec.emission == EmissionEnum::bigram_string)); + // "School" moved to the front of the list. + expected_domains = {"School", "Record", "Physician", "Practice", "City"}; BOOST_TEST(nr2.domains == expected_domains); BOOST_TEST(tschema.contains("Degree")); T_noisy_relation nr3 = std::get(tschema["Degree"]); BOOST_TEST(!nr3.is_observed); BOOST_TEST((nr3.emission_spec.emission == EmissionEnum::bigram_string)); + // "Physician", "School" moved to the front of the list. + expected_domains = {"Physician", "School", "Record", "Practice", "City"}; BOOST_TEST(nr3.domains == expected_domains); BOOST_TEST(tschema.contains("City")); T_noisy_relation nr4 = std::get(tschema["City"]); BOOST_TEST(!nr4.is_observed); BOOST_TEST((nr4.emission_spec.emission == EmissionEnum::bigram_string)); + // "City" moved to the front of the list. + expected_domains = {"City", "Record", "Physician", "School", "Practice"}; BOOST_TEST(nr4.domains == expected_domains); BOOST_TEST(tschema.contains("State")); T_noisy_relation nr5 = std::get(tschema["State"]); BOOST_TEST(!nr5.is_observed); BOOST_TEST((nr5.emission_spec.emission == EmissionEnum::bigram_string)); + // "City" moved to the front of the list. + expected_domains = {"City", "Record", "Physician", "School", "Practice"}; BOOST_TEST(nr5.domains == expected_domains); } From 595cf252c44345f4259c26302575016fffad227e Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Thu, 1 Aug 2024 14:32:03 +0000 Subject: [PATCH 5/6] Add unit test for reorder_domains --- cxx/pclean/schema_helper.cc | 2 -- cxx/pclean/schema_helper.hh | 7 +++++++ cxx/pclean/schema_helper_test.cc | 25 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index cd45f45..6c4a3a8 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -74,8 +74,6 @@ PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( assert(false); } -// Returns original_domains, but with the elements corresponding to -// annotated_ds elements that start with prefix moved to the front. std::vector reorder_domains( const std::vector& original_domains, const std::vector& annotated_ds, diff --git a/cxx/pclean/schema_helper.hh b/cxx/pclean/schema_helper.hh index 11a750d..6e13fe3 100644 --- a/cxx/pclean/schema_helper.hh +++ b/cxx/pclean/schema_helper.hh @@ -38,3 +38,10 @@ class PCleanSchemaHelper { std::map> domains; std::map> annotated_domains; }; + +// Returns original_domains, but with the elements corresponding to +// annotated_ds elements that start with prefix moved to the front. +std::vector reorder_domains( + const std::vector& original_domains, + const std::vector& annotated_ds, + const std::string& prefix); diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc index 747d060..a6a7291 100644 --- a/cxx/pclean/schema_helper_test.cc +++ b/cxx/pclean/schema_helper_test.cc @@ -208,4 +208,29 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(nr5.domains == expected_domains); } +BOOST_AUTO_TEST_CASE(test_reorder_domains) { + std::vector origs = {"0", "1", "2", "3", "4", "5", "6", "7"}; + std::vector annotated = { + "000", "001", "010", "011", "100", "101", "110", "111"}; + + std::vector expected = { + "6", "7", "0", "1", "2", "3", "4", "5"}; + BOOST_TEST(reorder_domains(origs, annotated, "11") == expected); + + expected = {"4", "5", "6", "7", "0", "1", "2", "3"}; + BOOST_TEST(reorder_domains(origs, annotated, "1") == expected); + + origs = { + "republic_of_ireland", "northern_ireland", "england", "scotland", "wales"}; + annotated = { + "ireland:republic_of_ireland", + "ireland:uk:northern_ireland", + "great_britain:uk:england", + "great_britain:uk:scotland", + "great_britain:uk:wales"}; + expected = { + "northern_ireland", "republic_of_ireland", "england", "scotland", "wales"}; + BOOST_TEST(reorder_domains(origs, annotated, "ireland:uk:") == expected); +} + BOOST_AUTO_TEST_SUITE_END() From ab537a1262c3c80be6c3636a7d3b9876de605eb4 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Thu, 1 Aug 2024 17:01:22 +0000 Subject: [PATCH 6/6] Add comment explaining how we move the base relation's domains to the front --- cxx/pclean/schema_helper.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index 6c4a3a8..0077985 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -110,6 +110,12 @@ T_schema PCleanSchemaHelper::make_hirm_schema() { const PCleanVariable sv = get_scalarvar_from_path( query_class, f.class_path.cbegin(), &final_class_name, &path_prefix); std::string base_relation = final_class_name + ':' + sv.name; + // If the base relation has n domains, we need the first n domains + // of this emission relation to be exactly the same (including order). + // The base relation's annotated_domains are exactly those that start + // with the path_prefix constructed above, and we use the fact that the + // domains and annotated_domains are in one-to-one correspondence to + // move the base relation's domains to the front. std::vector reordered_domains = reorder_domains( domains[query_class.name], annotated_domains[query_class.name],