diff --git a/cxx/pclean/io.cc b/cxx/pclean/io.cc index aeeb398..ddff934 100644 --- a/cxx/pclean/io.cc +++ b/cxx/pclean/io.cc @@ -133,11 +133,11 @@ bool read_query(std::istream& is, PCleanQuery* query) { printf("Expected exactly two tokens on query from line %s", line.c_str()); return false; } - if (!query->base_class.empty()) { + if (!query->record_class.empty()) { printf("Expected exactly one `from` clause in query.\n"); return false; } - query->base_class = tokens[1].val; + query->record_class = tokens[1].val; continue; } @@ -209,7 +209,7 @@ bool read_schema(std::istream& is, PCleanSchema* schema) { } if (tokens[0].val == "observe") { - if (!schema->query.base_class.empty()) { + if (!schema->query.record_class.empty()) { printf("Error reading schema line %s: only one query is allowed\n", line.c_str()); return false; diff --git a/cxx/pclean/io_test.cc b/cxx/pclean/io_test.cc index 3cc03be..45d30d1 100644 --- a/cxx/pclean/io_test.cc +++ b/cxx/pclean/io_test.cc @@ -41,7 +41,7 @@ observe )"""); PCleanSchema schema; BOOST_TEST(read_schema(ss, &schema)); - BOOST_TEST(schema.query.base_class == "Record"); + BOOST_TEST(schema.query.record_class == "Record"); BOOST_TEST(schema.query.fields.size() == 5); BOOST_TEST(schema.query.fields[0].name == "Specialty"); diff --git a/cxx/pclean/schema.hh b/cxx/pclean/schema.hh index 1186ed3..022738b 100644 --- a/cxx/pclean/schema.hh +++ b/cxx/pclean/schema.hh @@ -38,7 +38,7 @@ struct QueryField { }; struct PCleanQuery { - std::string base_class; + std::string record_class; std::vector fields; }; diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index 85ef91d..0077985 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -3,7 +3,7 @@ PCleanSchemaHelper::PCleanSchemaHelper(const PCleanSchema& s): schema(s) { compute_class_name_cache(); - compute_ancestors_cache(); + compute_domains_cache(); } void PCleanSchemaHelper::compute_class_name_cache() { @@ -12,63 +12,84 @@ void PCleanSchemaHelper::compute_class_name_cache() { } } -void PCleanSchemaHelper::compute_ancestors_cache() { +void PCleanSchemaHelper::compute_domains_cache() { for (const auto& c: schema.classes) { - if (!ancestors.contains(c.name)) { - ancestors[c.name] = compute_ancestors_for(c.name); + if (!domains.contains(c.name)) { + compute_domains_for(c.name); } } } -std::set PCleanSchemaHelper::compute_ancestors_for( - const std::string& name) { - std::set ancs; - std::set parents = get_parent_classes(name); - for (const std::string& p: parents) { - ancs.insert(p); - if (!ancestors.contains(p)) { - ancestors[p] = compute_ancestors_for(p); +void PCleanSchemaHelper::compute_domains_for(const std::string& name) { + std::vector ds; + std::vector annotated_ds; + ds.push_back(name); + annotated_ds.push_back(name); + PCleanClass c = get_class_by_name(name); + + for (const auto& v: c.vars) { + if (const ClassVar* cv = std::get_if(&(v.spec))) { + if (!domains.contains(cv->class_name)) { + compute_domains_for(cv->class_name); + } + for (const std::string& s : domains[cv->class_name]) { + ds.push_back(s); + } + for (const std::string& s : annotated_domains[cv->class_name]) { + annotated_ds.push_back(v.name + ':' + s); + } } - ancs.insert(ancestors[p].cbegin(), ancestors[p].cend()); } - return ancs; + + domains[name] = ds; + annotated_domains[name] = annotated_ds; } PCleanClass PCleanSchemaHelper::get_class_by_name(const std::string& name) { return schema.classes[class_name_to_index[name]]; } -std::set PCleanSchemaHelper::get_parent_classes( - const std::string& name) { - std::set parents; - PCleanClass c = get_class_by_name(name); - for (const auto& v: c.vars) { - if (const ClassVar* cv = std::get_if(&(v.spec))) { - parents.insert(cv->class_name); +PCleanVariable PCleanSchemaHelper::get_scalarvar_from_path( + const PCleanClass& base_class, + std::vector::const_iterator path_iterator, + std::string* final_class_name, + std::string* path_prefix) { + const std::string& s = *path_iterator; + for (const PCleanVariable& v : base_class.vars) { + if (v.name == s) { + if (std::holds_alternative(v.spec)) { + *final_class_name = base_class.name; + return v; + } + path_prefix->append(v.name + ":"); + const PCleanClass& next_class = get_class_by_name( + std::get(v.spec).class_name); + PCleanVariable sv = get_scalarvar_from_path( + next_class, ++path_iterator, final_class_name, path_prefix); + return sv; } } - return parents; + printf("Error: could not find name %s in class %s\n", + s.c_str(), base_class.name.c_str()); + assert(false); } -std::set PCleanSchemaHelper::get_ancestor_classes( - const std::string& name) { - return ancestors[name]; -} - -std::string get_base_relation_name( - const PCleanClass& c, const std::vector& field_path) { - assert(field_path.size() == 2); - const std::string& class_var = field_path[0]; - const std::string& var_name = field_path[1]; - std::string class_name = ""; - for (const auto& v : c.vars) { - if (v.name == class_var) { - class_name = std::get(v.spec).class_name; - break; +std::vector reorder_domains( + const std::vector& original_domains, + const std::vector& annotated_ds, + const std::string& prefix) { + std::vector output_domains; + for (size_t i = 0; i < original_domains.size(); ++i) { + if (annotated_ds[i].starts_with(prefix)) { + output_domains.push_back(original_domains[i]); } } - assert(class_name != ""); - return class_name + ':' + var_name; + for (size_t i = 0; i < original_domains.size(); ++i) { + if (!annotated_ds[i].starts_with(prefix)) { + output_domains.push_back(original_domains[i]); + } + } + return output_domains; } T_schema PCleanSchemaHelper::make_hirm_schema() { @@ -77,16 +98,31 @@ T_schema PCleanSchemaHelper::make_hirm_schema() { for (const auto& v : c.vars) { std::string rel_name = c.name + ':' + v.name; if (const ScalarVar* dv = std::get_if(&(v.spec))) { - std::vector domains; - domains.push_back(c.name); - for (const std::string& sc : get_ancestor_classes(c.name)) { - domains.push_back(sc); - } - tschema[rel_name] = get_distribution_relation(*dv, domains); + tschema[rel_name] = get_distribution_relation(*dv, domains[c.name]); } - // TODO(thomaswc): If this class isn't the observation class, - // create additional noisy relations. } } + + const PCleanClass query_class = get_class_by_name(schema.query.record_class); + for (const auto& f : schema.query.fields) { + std::string final_class_name; + std::string path_prefix; + const PCleanVariable sv = get_scalarvar_from_path( + query_class, f.class_path.cbegin(), &final_class_name, &path_prefix); + std::string base_relation = final_class_name + ':' + sv.name; + // If the base relation has n domains, we need the first n domains + // of this emission relation to be exactly the same (including order). + // The base relation's annotated_domains are exactly those that start + // with the path_prefix constructed above, and we use the fact that the + // domains and annotated_domains are in one-to-one correspondence to + // move the base relation's domains to the front. + std::vector reordered_domains = reorder_domains( + domains[query_class.name], + annotated_domains[query_class.name], + path_prefix); + tschema[f.name] = get_emission_relation( + std::get(sv.spec), reordered_domains, base_relation); + } + return tschema; } diff --git a/cxx/pclean/schema_helper.hh b/cxx/pclean/schema_helper.hh index 8a13bb0..6e13fe3 100644 --- a/cxx/pclean/schema_helper.hh +++ b/cxx/pclean/schema_helper.hh @@ -17,23 +17,31 @@ class PCleanSchemaHelper { PCleanClass get_class_by_name(const std::string& name); - // The parent classes of a class are those that are referred to by a - // ClassVar inside the class. - std::set get_parent_classes(const std::string& name); - // The ancestors of a class are the transitive closure of the parent - // relationship. - std::set get_ancestor_classes(const std::string& name); - // The source classes of a class are its ancestors without parents. - std::set get_source_classes(const std::string& name); - T_schema make_hirm_schema(); - private: + // The rest of these methods are conceptually private, but actually + // public for testing. + void compute_class_name_cache(); - void compute_ancestors_cache(); - std::set compute_ancestors_for(const std::string& name); + void compute_domains_cache(); + + void compute_domains_for(const std::string& name); + + PCleanVariable get_scalarvar_from_path( + const PCleanClass& base_class, + std::vector::const_iterator path_iterator, + std::string* final_class_name, + std::string* path_prefix); PCleanSchema schema; std::map class_name_to_index; - std::map> ancestors; + std::map> domains; + std::map> annotated_domains; }; + +// Returns original_domains, but with the elements corresponding to +// annotated_ds elements that start with prefix moved to the front. +std::vector reorder_domains( + const std::vector& original_domains, + const std::vector& annotated_ds, + const std::string& prefix); diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc index 0211412..a6a7291 100644 --- a/cxx/pclean/schema_helper_test.cc +++ b/cxx/pclean/schema_helper_test.cc @@ -33,7 +33,7 @@ class Record observe physician.specialty as Specialty physician.school.name as School - physician.observed_degree as Degree + physician.degree as Degree location.city.name as City location.city.state as State from Record @@ -54,22 +54,85 @@ BOOST_AUTO_TEST_CASE(test_get_class_by_name) { BOOST_TEST(c.name == "Practice"); } -BOOST_AUTO_TEST_CASE(test_get_parent_classes) { +BOOST_AUTO_TEST_CASE(test_domains_cache) { PCleanSchemaHelper schema_helper(schema); - BOOST_TEST(schema_helper.get_parent_classes("School").empty()); - BOOST_TEST(schema_helper.get_parent_classes("City").empty()); - BOOST_TEST(schema_helper.get_parent_classes("Physician").size() == 1); - BOOST_TEST(schema_helper.get_parent_classes("Practice").size() == 1); - BOOST_TEST(schema_helper.get_parent_classes("Record").size() == 2); + + std::vector expected_domains = {"School"}; + std::vector expected_annotated_domains = {"School"}; + BOOST_TEST(schema_helper.domains["School"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["School"] == expected_annotated_domains); + + expected_domains = {"Physician", "School"}; + expected_annotated_domains = {"Physician", "school:School"}; + BOOST_TEST(schema_helper.domains["Physician"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Physician"] == expected_annotated_domains); + + expected_domains = {"City"}; + expected_annotated_domains = {"City"}; + BOOST_TEST(schema_helper.domains["City"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["City"] == expected_annotated_domains); + + expected_domains = {"Practice", "City"}; + expected_annotated_domains = {"Practice", "city:City"}; + BOOST_TEST(schema_helper.domains["Practice"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Practice"] == expected_annotated_domains); + + expected_domains = { + "Record", "Physician", "School", "Practice", "City"}; + expected_annotated_domains = { + "Record", "physician:Physician", "physician:school:School", + "location:Practice", "location:city:City"}; + BOOST_TEST(schema_helper.domains["Record"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Record"] == expected_annotated_domains); } -BOOST_AUTO_TEST_CASE(test_get_ancestor_classes) { +BOOST_AUTO_TEST_CASE(test_domains_cache_two_paths_same_source) { + std::stringstream ss(R"""( +class City + name ~ string + +class Person + birth_city ~ City + home_city ~ City +)"""); + PCleanSchema schema; + assert(read_schema(ss, &schema)); + PCleanSchemaHelper schema_helper(schema); + + std::vector expected_domains = { + "Person", "City", "City"}; + std::vector expected_annotated_domains = { + "Person", "birth_city:City", "home_city:City"}; + BOOST_TEST(schema_helper.domains["Person"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Person"] == expected_annotated_domains); +} + +BOOST_AUTO_TEST_CASE(test_domains_cache_diamond) { + std::stringstream ss(R"""( +class City + name ~ string + +class School + location ~ City + +class Practice + location ~ City + +class Physician + practice ~ Practice + school ~ School +)"""); + PCleanSchema schema; + assert(read_schema(ss, &schema)); PCleanSchemaHelper schema_helper(schema); - BOOST_TEST(schema_helper.get_ancestor_classes("School").empty()); - BOOST_TEST(schema_helper.get_ancestor_classes("City").empty()); - BOOST_TEST(schema_helper.get_ancestor_classes("Physician").size() == 1); - BOOST_TEST(schema_helper.get_ancestor_classes("Practice").size() == 1); - BOOST_TEST(schema_helper.get_ancestor_classes("Record").size() == 4); + + std::vector expected_domains = { + "Physician", "Practice", "City", "School", "City"}; + std::vector expected_annotated_domains = { + "Physician", "practice:Practice", "practice:location:City", + "school:School", "school:location:City"}; + BOOST_TEST(schema_helper.domains["Physician"] == expected_domains); + BOOST_TEST(schema_helper.annotated_domains["Physician"] == expected_annotated_domains); } BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { @@ -103,6 +166,71 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(cr4.domains == expected_domains3); BOOST_TEST(tschema.contains("City:state")); + + BOOST_TEST(tschema.contains("Specialty")); + T_noisy_relation nr1 = std::get(tschema["Specialty"]); + BOOST_TEST(!nr1.is_observed); + BOOST_TEST((nr1.emission_spec.emission == EmissionEnum::bigram_string)); + // "Physician", "School" moved to the front of the list. + expected_domains = {"Physician", "School", "Record", "Practice", "City"}; + BOOST_TEST(nr1.domains == expected_domains); + + BOOST_TEST(tschema.contains("School")); + T_noisy_relation nr2 = std::get(tschema["School"]); + BOOST_TEST(!nr2.is_observed); + BOOST_TEST((nr2.emission_spec.emission == EmissionEnum::bigram_string)); + // "School" moved to the front of the list. + expected_domains = {"School", "Record", "Physician", "Practice", "City"}; + BOOST_TEST(nr2.domains == expected_domains); + + BOOST_TEST(tschema.contains("Degree")); + T_noisy_relation nr3 = std::get(tschema["Degree"]); + BOOST_TEST(!nr3.is_observed); + BOOST_TEST((nr3.emission_spec.emission == EmissionEnum::bigram_string)); + // "Physician", "School" moved to the front of the list. + expected_domains = {"Physician", "School", "Record", "Practice", "City"}; + BOOST_TEST(nr3.domains == expected_domains); + + BOOST_TEST(tschema.contains("City")); + T_noisy_relation nr4 = std::get(tschema["City"]); + BOOST_TEST(!nr4.is_observed); + BOOST_TEST((nr4.emission_spec.emission == EmissionEnum::bigram_string)); + // "City" moved to the front of the list. + expected_domains = {"City", "Record", "Physician", "School", "Practice"}; + BOOST_TEST(nr4.domains == expected_domains); + + BOOST_TEST(tschema.contains("State")); + T_noisy_relation nr5 = std::get(tschema["State"]); + BOOST_TEST(!nr5.is_observed); + BOOST_TEST((nr5.emission_spec.emission == EmissionEnum::bigram_string)); + // "City" moved to the front of the list. + expected_domains = {"City", "Record", "Physician", "School", "Practice"}; + BOOST_TEST(nr5.domains == expected_domains); +} + +BOOST_AUTO_TEST_CASE(test_reorder_domains) { + std::vector origs = {"0", "1", "2", "3", "4", "5", "6", "7"}; + std::vector annotated = { + "000", "001", "010", "011", "100", "101", "110", "111"}; + + std::vector expected = { + "6", "7", "0", "1", "2", "3", "4", "5"}; + BOOST_TEST(reorder_domains(origs, annotated, "11") == expected); + + expected = {"4", "5", "6", "7", "0", "1", "2", "3"}; + BOOST_TEST(reorder_domains(origs, annotated, "1") == expected); + + origs = { + "republic_of_ireland", "northern_ireland", "england", "scotland", "wales"}; + annotated = { + "ireland:republic_of_ireland", + "ireland:uk:northern_ireland", + "great_britain:uk:england", + "great_britain:uk:scotland", + "great_britain:uk:wales"}; + expected = { + "northern_ireland", "republic_of_ireland", "england", "scotland", "wales"}; + BOOST_TEST(reorder_domains(origs, annotated, "ireland:uk:") == expected); } BOOST_AUTO_TEST_SUITE_END()