From 1484d8761159f54b11f689caaf826e771e53e854 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 31 Jul 2024 18:50:54 +0000 Subject: [PATCH 01/14] Load and incorporate observations in pclean main --- cxx/pclean/BUILD | 2 ++ cxx/pclean/pclean.cc | 50 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/cxx/pclean/BUILD b/cxx/pclean/BUILD index be61340..980282a 100644 --- a/cxx/pclean/BUILD +++ b/cxx/pclean/BUILD @@ -59,12 +59,14 @@ cc_binary( name = "pclean", srcs = ["pclean.cc"], deps = [ + ":csv", ":io", ":schema", ":schema_helper", "//:cxxopts", "//:hirm_lib", "//:inference", + "//:util_io", ], ) diff --git a/cxx/pclean/pclean.cc b/cxx/pclean/pclean.cc index 99fc374..ab5d541 100644 --- a/cxx/pclean/pclean.cc +++ b/cxx/pclean/pclean.cc @@ -8,12 +8,42 @@ #include #include "cxxopts.hpp" +#include "irm.hh" #include "hirm.hh" #include "inference.hh" +#include "util_io.hh" +#include "pclean/csv.hh" #include "pclean/io.hh" #include "pclean/schema.hh" #include "pclean/schema_helper.hh" +T_observations translate_observations( + const DataFrame& df, const T_schema &schema) { + T_observations obs; + + for (const auto& col : df.data) { + const std::string& col_name = col.first; + const T_relation trel = schema[col_name]; + size_t num_domains; + std::visit([&](const auto &r) { + num_domains = r.domains.size(); + }, trel); + + for (size_t i = 0; i < col.second.size(); ++i) { + const std::string& val = col.second[i]; + std::vector entities; + for (size_t j = 0; j < num_domains; ++j) { + // Assume that each row of the dataframe is its own entity, *and* + // that all of its ancestor entities are distinct from those of any + // other entity. + entities.push_back(std::to_string(i)); + } + obs[col_name].push_back(std::make_tuple(entities, val)); + } + } + return obs; +} + int main(int argc, char** argv) { cxxopts::Options options( "pclean", "Run HIRM from a PClean schema"); @@ -48,25 +78,41 @@ int main(int argc, char** argv) { // Read schema PCleanSchema pclean_schema; std::string schema_fn = result["schema"].as(); + std::cout << "Reading schema file from " << schema_fn << "\n"; if (!read_schema_file(schema_fn, &pclean_schema)) { std::cout << "Error reading schema file" << schema_fn << "\n"; } // Translate schema + std::cout << "Translating schema ...\n"; PCleanSchemaHelper schema_helper(pclean_schema); T_schema hirm_schema = schema_helper.make_hirm_schema(); // Read observations std::string obs_fn = result["obs"].as(); - // TODO(thomaswc): This + std::cout << "Reading observations file from " << obs_fn << "\n"; + DataFrame df = DataFrame::from_csv(obs_fn); + + // Validate that we have a relation for each observation column. + for (const auto &col : df.data) { + if (!hirm_schema.contains(col.first)) { + printf("Error, could not find HIRM relation for column %s\n", + col.first.c_str()); + assert(false); + } + } // Create model HIRM hirm(hirm_schema, &prng); // Incorporate observations. - // TODO(thomaswc): This + std::cout << "Incorporating observations ...\n"; + T_observations observations = translate_observations(df, hirm_schema); + T_encoding encoding = encode_observations(hirm_schema, observations); + incorporate_observations(&prng, &hirm, encoding, observations); // Run inference + std::cout << "Running inference ...\n"; inference_hirm(&prng, &hirm, result["iters"].as(), result["timeout"].as(), From ced702a166ceaf6c0383fdadd24e34e81c57592f Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 31 Jul 2024 18:59:36 +0000 Subject: [PATCH 02/14] Fix build error --- cxx/pclean/pclean.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cxx/pclean/pclean.cc b/cxx/pclean/pclean.cc index ab5d541..66d38aa 100644 --- a/cxx/pclean/pclean.cc +++ b/cxx/pclean/pclean.cc @@ -23,7 +23,7 @@ T_observations translate_observations( for (const auto& col : df.data) { const std::string& col_name = col.first; - const T_relation trel = schema[col_name]; + const T_relation& trel = schema.at(col_name); size_t num_domains; std::visit([&](const auto &r) { num_domains = r.domains.size(); From 81791f6a6a820aecbfdbd629fd26dee6ab55e578 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Thu, 1 Aug 2024 15:58:36 +0000 Subject: [PATCH 03/14] Don't incorporate missing values. --- cxx/pclean/pclean.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cxx/pclean/pclean.cc b/cxx/pclean/pclean.cc index 66d38aa..7a74aa4 100644 --- a/cxx/pclean/pclean.cc +++ b/cxx/pclean/pclean.cc @@ -31,6 +31,13 @@ T_observations translate_observations( for (size_t i = 0; i < col.second.size(); ++i) { const std::string& val = col.second[i]; + if (val.empty()) { + // Don't incorporate missing values. + // TODO(thomaswc): Allow the user to specify other values that mean + // missing data. ("missing", "NA", "nan", etc.). + continue; + } + std::vector entities; for (size_t j = 0; j < num_domains; ++j) { // Assume that each row of the dataframe is its own entity, *and* From bb58176d15d7794526b2df0186896e2337c2aa89 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Mon, 5 Aug 2024 21:13:03 +0000 Subject: [PATCH 04/14] Add test for translate_observations --- cxx/pclean/BUILD | 21 +++++++++++++ cxx/pclean/pclean.cc | 35 +-------------------- cxx/pclean/pclean_lib.cc | 42 ++++++++++++++++++++++++++ cxx/pclean/pclean_lib.hh | 16 ++++++++++ cxx/pclean/pclean_lib_test.cc | 57 +++++++++++++++++++++++++++++++++++ 5 files changed, 137 insertions(+), 34 deletions(-) create mode 100644 cxx/pclean/pclean_lib.cc create mode 100644 cxx/pclean/pclean_lib.hh create mode 100644 cxx/pclean/pclean_lib_test.cc diff --git a/cxx/pclean/BUILD b/cxx/pclean/BUILD index 980282a..d49eff7 100644 --- a/cxx/pclean/BUILD +++ b/cxx/pclean/BUILD @@ -61,6 +61,7 @@ cc_binary( deps = [ ":csv", ":io", + ":pclean_lib", ":schema", ":schema_helper", "//:cxxopts", @@ -70,6 +71,26 @@ cc_binary( ], ) +cc_library( + name = "pclean_lib", + hdrs = ["pclean_lib.hh"], + srcs = ["pclean_lib.cc"], + deps = [ + ":csv", + "//:hirm_lib", + "//:util_io", + ], +) + +cc_test( + name = "pclean_lib_test", + srcs = ["pclean_lib_test.cc"], + deps = [ + ":pclean_lib", + "@boost//:test", + ], +) + cc_library( name = "schema", hdrs = ["schema.hh"], diff --git a/cxx/pclean/pclean.cc b/cxx/pclean/pclean.cc index 81471de..d4d16af 100644 --- a/cxx/pclean/pclean.cc +++ b/cxx/pclean/pclean.cc @@ -14,43 +14,10 @@ #include "util_io.hh" #include "pclean/csv.hh" #include "pclean/io.hh" +#include "pclean/pclean_lib.hh" #include "pclean/schema.hh" #include "pclean/schema_helper.hh" -T_observations translate_observations( - const DataFrame& df, const T_schema &schema) { - T_observations obs; - - for (const auto& col : df.data) { - const std::string& col_name = col.first; - const T_relation& trel = schema.at(col_name); - size_t num_domains; - std::visit([&](const auto &r) { - num_domains = r.domains.size(); - }, trel); - - for (size_t i = 0; i < col.second.size(); ++i) { - const std::string& val = col.second[i]; - if (val.empty()) { - // Don't incorporate missing values. - // TODO(thomaswc): Allow the user to specify other values that mean - // missing data. ("missing", "NA", "nan", etc.). - continue; - } - - std::vector entities; - for (size_t j = 0; j < num_domains; ++j) { - // Assume that each row of the dataframe is its own entity, *and* - // that all of its ancestor entities are distinct from those of any - // other entity. - entities.push_back(std::to_string(i)); - } - obs[col_name].push_back(std::make_tuple(entities, val)); - } - } - return obs; -} - int main(int argc, char** argv) { cxxopts::Options options( "pclean", "Run HIRM from a PClean schema"); diff --git a/cxx/pclean/pclean_lib.cc b/cxx/pclean/pclean_lib.cc new file mode 100644 index 0000000..6d0d8f0 --- /dev/null +++ b/cxx/pclean/pclean_lib.cc @@ -0,0 +1,42 @@ +// Copyright 2024 +// Apache License, Version 2.0, refer to LICENSE.txt + +#include "irm.hh" +#include "pclean/csv.hh" +#include "pclean/pclean_lib.hh" + +T_observations translate_observations( + const DataFrame& df, const T_schema &schema) { + T_observations obs; + int uniq = 0; + + for (const auto& col : df.data) { + const std::string& col_name = col.first; + if (!schema.contains(col_name)) { + printf("Schema does not contain %s, skipping ...\n", col_name.c_str()); + continue; + } + + const T_relation& trel = schema.at(col_name); + size_t num_domains = std::visit([&](const auto &r) { + return r.domains.size();}, trel); + + for (size_t i = 0; i < col.second.size(); ++i) { + const std::string& val = col.second[i]; + if (val.empty()) { + // Don't incorporate missing values. + // TODO(thomaswc): Allow the user to specify other values that mean + // missing data. ("missing", "NA", "nan", etc.). + continue; + } + + std::vector entities; + for (size_t j = 0; j < num_domains; ++j) { + // Give each entity in every domain its own unique value. + entities.push_back(std::to_string(uniq++)); + } + obs[col_name].push_back(std::make_tuple(entities, val)); + } + } + return obs; +} diff --git a/cxx/pclean/pclean_lib.hh b/cxx/pclean/pclean_lib.hh new file mode 100644 index 0000000..a3e04ba --- /dev/null +++ b/cxx/pclean/pclean_lib.hh @@ -0,0 +1,16 @@ +// Copyright 2024 +// Apache License, Version 2.0, refer to LICENSE.txt + +#pragma once + +#include "irm.hh" +#include "util_io.hh" +#include "pclean/csv.hh" +#include "pclean/pclean_lib.hh" + +// For each non-missing value in the DataFrame df, create an +// observation in the returned T_observations. The column name of the value +// is used as the relation name, and each entity in each domain is given +// its own unique value. +T_observations translate_observations( + const DataFrame& df, const T_schema &schema); diff --git a/cxx/pclean/pclean_lib_test.cc b/cxx/pclean/pclean_lib_test.cc new file mode 100644 index 0000000..d44f422 --- /dev/null +++ b/cxx/pclean/pclean_lib_test.cc @@ -0,0 +1,57 @@ +#define BOOST_TEST_MODULE test pclean_csv + +#include "pclean/pclean_lib.hh" +#include +#include +namespace tt = boost::test_tools; + +BOOST_AUTO_TEST_CASE(test_translate_observations) { + std::stringstream ss(R"""(Column1,Room Type,Monthly Rent,County,State +0,studio,,Mahoning County,OH +1,4br,2152.0,,NV +2,1br,1267.0,Gwinnett County, +)"""); + + DataFrame df = DataFrame::from_csv(ss); + + std::map state_params = {{"strings", "AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY"}}; + std::map br_params = {{"strings", "1br 2br 3br 4br studio"}}; + + T_schema schema = { + {"County:name", + T_clean_relation{{"County"}, false, DistributionSpec("bigram")}}, + {"County:state", + T_clean_relation{{"County"}, false, DistributionSpec("stringcat", state_params)}}, + {"Room Type", + T_clean_relation{{"Obs"}, false, DistributionSpec("stringcat", br_params)}}, + {"Monthly Rent", + T_clean_relation{{"Obs"}, false, DistributionSpec("normal")}}, + {"County", + T_noisy_relation{{"County", "Obs"}, false, EmissionSpec("bigram"), "County:name"}}, + {"State", + T_noisy_relation{{"County", "Obs"}, false, EmissionSpec("bigram"), "County:state"}}}; + + T_observations obs = translate_observations(df, schema); + + // Relations not corresponding to columns should be un-observed. + BOOST_TEST(!obs.contains("County:name")); + BOOST_TEST(!obs.contains("County:state")); + + BOOST_TEST(obs["Room Type"].size() == 3); + BOOST_TEST(obs["Monthly Rent"].size() == 2); + BOOST_TEST(obs["County"].size() == 2); + BOOST_TEST(obs["State"].size() == 2); + + BOOST_TEST(std::get<0>(obs["Room Type"][0]).size() == 1); + BOOST_TEST(std::get<1>(obs["Room Type"][0]) == "studio"); + + BOOST_TEST(std::get<0>(obs["Monthly Rent"][0]).size() == 1); + BOOST_TEST(std::get<1>(obs["Monthly Rent"][0]) == "2152.0"); + + BOOST_TEST(std::get<0>(obs["County"][0]).size() == 2); + BOOST_TEST(std::get<1>(obs["County"][0]) == "Mahoning County"); + + BOOST_TEST(std::get<0>(obs["State"][0]).size() == 2); + BOOST_TEST(std::get<1>(obs["State"][0]) == "OH"); +} + From 027b4840d346486037692a620e251febc5738b0d Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Mon, 5 Aug 2024 21:49:18 +0000 Subject: [PATCH 05/14] Add workaround to std::getline bug --- cxx/pclean/csv.cc | 11 ++++++++++- cxx/pclean/pclean_lib.cc | 7 ++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cxx/pclean/csv.cc b/cxx/pclean/csv.cc index 01bd166..049723e 100644 --- a/cxx/pclean/csv.cc +++ b/cxx/pclean/csv.cc @@ -53,7 +53,16 @@ DataFrame DataFrame::from_csv( df.data[col_names[i++]].push_back(part); } if (!first_line) { - assert(i == col_names.size()); + if (i != col_names.size()) { + if (line.back() == ',') { + // std::getline is broken and won't let the last field be empty. + df.data[col_names[i++]].push_back(""); + } else { + printf("Only found %ld out of %ld expected columns in line\n%s\n", + i, col_names.size(), line.c_str()); + assert(false); + } + } } first_line = false; } diff --git a/cxx/pclean/pclean_lib.cc b/cxx/pclean/pclean_lib.cc index 6d0d8f0..59d40fa 100644 --- a/cxx/pclean/pclean_lib.cc +++ b/cxx/pclean/pclean_lib.cc @@ -8,7 +8,6 @@ T_observations translate_observations( const DataFrame& df, const T_schema &schema) { T_observations obs; - int uniq = 0; for (const auto& col : df.data) { const std::string& col_name = col.first; @@ -32,8 +31,10 @@ T_observations translate_observations( std::vector entities; for (size_t j = 0; j < num_domains; ++j) { - // Give each entity in every domain its own unique value. - entities.push_back(std::to_string(uniq++)); + // Give every row it's own universe of unique id's. + // TODO(thomaswc): Correctly handle the case when a row makes + // references to two or more different entities of the same type. + entities.push_back(std::to_string(i)); } obs[col_name].push_back(std::make_tuple(entities, val)); } From a0165380f830f2c22fa9b20c4c528e2aa22ea1de Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Tue, 6 Aug 2024 15:12:15 +0000 Subject: [PATCH 06/14] Prepend d to domain names in test for clarity --- cxx/pclean/pclean_lib_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cxx/pclean/pclean_lib_test.cc b/cxx/pclean/pclean_lib_test.cc index d44f422..8a61073 100644 --- a/cxx/pclean/pclean_lib_test.cc +++ b/cxx/pclean/pclean_lib_test.cc @@ -19,17 +19,17 @@ BOOST_AUTO_TEST_CASE(test_translate_observations) { T_schema schema = { {"County:name", - T_clean_relation{{"County"}, false, DistributionSpec("bigram")}}, + T_clean_relation{{"dCounty"}, false, DistributionSpec("bigram")}}, {"County:state", - T_clean_relation{{"County"}, false, DistributionSpec("stringcat", state_params)}}, + T_clean_relation{{"dCounty"}, false, DistributionSpec("stringcat", state_params)}}, {"Room Type", - T_clean_relation{{"Obs"}, false, DistributionSpec("stringcat", br_params)}}, + T_clean_relation{{"dObs"}, false, DistributionSpec("stringcat", br_params)}}, {"Monthly Rent", - T_clean_relation{{"Obs"}, false, DistributionSpec("normal")}}, + T_clean_relation{{"dObs"}, false, DistributionSpec("normal")}}, {"County", - T_noisy_relation{{"County", "Obs"}, false, EmissionSpec("bigram"), "County:name"}}, + T_noisy_relation{{"dCounty", "dObs"}, false, EmissionSpec("bigram"), "County:name"}}, {"State", - T_noisy_relation{{"County", "Obs"}, false, EmissionSpec("bigram"), "County:state"}}}; + T_noisy_relation{{"dCounty", "dObs"}, false, EmissionSpec("bigram"), "County:state"}}}; T_observations obs = translate_observations(df, schema); From c25bd277d96209bedead6dc6693968cc6aafa038 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Tue, 6 Aug 2024 20:43:20 +0000 Subject: [PATCH 07/14] Debugging stuff mostly --- cxx/assets/flights_dirty.100.csv | 100 ++++++++++++++++++++++++++++++ cxx/assets/hospital_dirty.100.csv | 100 ++++++++++++++++++++++++++++++ cxx/assets/rents_dirty.100.csv | 100 ++++++++++++++++++++++++++++++ cxx/emissions/bigram_string.cc | 5 ++ cxx/emissions/string_alignment.cc | 3 + cxx/hirm.cc | 3 + cxx/integration_tests.sh | 18 +++--- cxx/irm.cc | 2 + cxx/pclean/csv.cc | 20 +++--- cxx/pclean/pclean.cc | 13 +--- cxx/util_io.cc | 17 +++++ 11 files changed, 354 insertions(+), 27 deletions(-) create mode 100644 cxx/assets/flights_dirty.100.csv create mode 100644 cxx/assets/hospital_dirty.100.csv create mode 100644 cxx/assets/rents_dirty.100.csv diff --git a/cxx/assets/flights_dirty.100.csv b/cxx/assets/flights_dirty.100.csv new file mode 100644 index 0000000..c7e3b8b --- /dev/null +++ b/cxx/assets/flights_dirty.100.csv @@ -0,0 +1,100 @@ +tuple_id,src,flight,sched_dep_time,act_dep_time,sched_arr_time,act_arr_time +1,aa,AA-3859-IAH-ORD,7:10 a.m.,7:16 a.m.,9:40 a.m.,9:32 a.m. +2,aa,AA-1733-ORD-PHX,7:45 p.m.,7:58 p.m.,10:30 p.m., +3,aa,AA-1640-MIA-MCO,6:30 p.m.,,7:25 p.m., +4,aa,AA-518-MIA-JFK,6:40 a.m.,6:54 a.m.,9:25 a.m.,9:28 a.m. +5,aa,AA-3756-ORD-SLC,12:15 p.m.,12:41 p.m.,2:45 p.m.,2:50 p.m. +6,aa,AA-204-LAX-MCO,11:25 p.m.,,12/02/2011 6:55 a.m., +7,aa,AA-3468-CVG-MIA,7:00 a.m.,7:25 a.m.,9:55 a.m.,9:45 a.m. +8,aa,AA-484-DFW-MIA,4:15 p.m.,4:29 p.m.,7:55 p.m.,7:39 p.m. +9,aa,AA-446-DFW-PHL,11:50 a.m.,12:12 p.m.,3:50 p.m.,4:09 p.m. +10,aa,AA-466-IAH-MIA,6:00 a.m.,6:08 a.m.,9:20 a.m.,9:05 a.m. +11,aa,AA-1886-BOS-MIA,10:45 a.m.,10:55 a.m.,2:20 p.m.,1:40 p.m. +12,aa,AA-2957-DFW-CVG,7:55 a.m.,8:04 a.m.,11:05 a.m.,11:01 a.m. +13,aa,AA-1664-MIA-ATL,10:15 a.m.,10:18 a.m.,12:10 p.m.,11:56 a.m. +14,aa,AA-3979-CVG-ORD,7:30 a.m.,8:04 a.m.,8:00 a.m.,8:06 a.m. +15,aa,AA-1279-DFW-PHX,1:00 p.m.,2:04 p.m.,2:35 p.m.,3:30 p.m. +16,aa,AA-616-DFW-DTW,9:05 a.m.,10:10 a.m.,12:35 p.m.,1:27 p.m. +17,aa,AA-4344-ORD-DTW,11:45 a.m.,12:13 p.m.,2:00 p.m.,2:10 p.m. +18,aa,AA-2525-DFW-MIA,7:25 a.m.,7:39 a.m.,11:10 a.m.,11:06 a.m. +19,aa,AA-404-MIA-MCO,6:45 a.m.,6:57 a.m.,7:45 a.m.,7:40 a.m. +20,aa,AA-271-MIA-LAX,11:35 a.m.,11:52 a.m.,2:25 p.m.,2:27 p.m. +21,aa,AA-2050-ORD-MIA,10:40 a.m.,11:05 a.m.,2:45 p.m.,2:46 p.m. +22,aa,AA-4330-CVG-ORD,3:35 p.m.,3:36 p.m.,4:00 p.m.,3:33 p.m. +23,aa,AA-1434-DFW-MCO,7:15 a.m.,7:21 a.m.,10:35 a.m.,10:36 a.m. +24,aa,AA-4307-ORD-DTW,6:45 p.m.,6:55 p.m.,9:10 p.m.,8:53 p.m. +25,aa,AA-3-JFK-LAX,12:00 p.m.,12:11 p.m.,3:15 p.m.,3:16 p.m. +26,aa,AA-3842-MSP-ORD,6:00 a.m.,6:56 a.m.,7:30 a.m.,7:51 a.m. +27,aa,AA-643-MIA-ORD,7:10 a.m.,7:30 a.m.,9:35 a.m.,9:31 a.m. +28,aa,AA-1007-MIA-PHX,4:55 p.m.,5:08 p.m.,8:05 p.m.,7:55 p.m. +29,aa,AA-1221-MCO-ORD,8:00 p.m.,8:23 p.m.,9:45 p.m.,9:53 p.m. +30,aa,AA-400-DFW-JFK,10:40 a.m.,11:01 a.m.,2:59 p.m.,3:04 p.m. +31,aa,AA-4198-ORD-CLE,10:40 a.m.,10:54 a.m.,12:55 p.m.,12:50 p.m. +32,aa,AA-1522-SFO-ORD,11:55 p.m.,,12/02/2011 5:50 a.m., +33,aa,AA-3823-LAX-DEN,9:00 p.m.,,12/02/2011 12:15 a.m., +34,aa,AA-2312-DFW-DTW,8:25 p.m.,,11:50 p.m., +35,aa,AA-1165-JFK-MIA,6:59 a.m.,7:22 a.m.,10:34 a.m.,10:04 a.m. +36,aa,AA-431-MIA-SFO,8:35 a.m.,8:51 a.m.,11:50 a.m.,11:44 a.m. +37,aa,AA-649-ORD-SNA,1:30 p.m.,,3:50 p.m., +38,aa,AA-3063-SLC-LAX,8:20 p.m.,8:39 p.m.,9:20 p.m., +39,aa,AA-3804-PHL-ORD,2:35 p.m.,2:44 p.m.,4:05 p.m.,3:43 p.m. +40,aa,AA-1917-JFK-MCO,2:55 p.m.,3:27 p.m.,5:55 p.m.,5:43 p.m. +41,aa,AA-2268-PHX-ORD,7:15 a.m.,7:22 a.m.,11:35 a.m.,11:06 a.m. +42,aa,AA-4277-CVG-JFK,12:10 p.m.,12:10 p.m.,2:15 p.m.,1:43 p.m. +43,aa,AA-789-ORD-DEN,1:05 p.m.,1:19 p.m.,2:35 p.m.,3:13 p.m. +44,aa,AA-415-BOS-ORD,8:35 a.m.,8:56 a.m.,10:40 a.m.,10:16 a.m. +45,aa,AA-85-JFK-SFO,3:05 p.m.,3:36 p.m.,6:30 p.m.,6:43 p.m. +46,aa,AA-1544-SAN-ORD,11:25 a.m.,11:40 a.m.,5:25 p.m.,4:56 p.m. +47,aa,AA-3786-IAH-ORD,4:00 p.m.,4:12 p.m.,6:40 p.m.,6:15 p.m. +48,aa,AA-222-LAX-BOS,7:10 a.m.,7:31 a.m.,3:30 p.m.,3:14 p.m. +49,aa,AA-3809-PHX-LAX,6:00 a.m.,6:10 a.m.,6:40 a.m.,6:19 a.m. +50,aa,AA-59-JFK-SFO,7:10 a.m.,7:39 a.m.,10:45 a.m.,11:12 a.m. +51,helloflight,AA-3859-IAH-ORD,,7:16 a.m.,,9:22 a.m. +52,helloflight,CO-1586-IAH-MCO,7:00 p.m.,,10:15 p.m., +53,helloflight,AA-518-MIA-JFK,,6:54 a.m.,,9:14 a.m. +54,helloflight,UA-397-JFK-SFO,,8:48 a.m.,,11:30 a.m. +55,helloflight,UA-3925-IAD-ORF,,9:00 a.m.,,9:29 a.m. +56,helloflight,AA-1733-ORD-PHX,,7:59 p.m.,,10:31 p.m. +57,helloflight,UA-2830-MCO-CLT,,3:50 p.m.,,5:14 p.m. +58,helloflight,CO-58-DEN-IAH,11:25 a.m.,,2:50 p.m., +59,helloflight,AA-1544-SAN-ORD,,11:41 a.m.,,4:54 p.m. +60,helloflight,UA-5487-SFO-MRY,,10:53 a.m.,,11:11 a.m. +61,helloflight,AA-204-LAX-MCO,,11:29 p.m.,,12/2/11 6:35 a.m. +62,helloflight,UA-414-LAX-SFO,,6:10 a.m.,,7:04 a.m. +63,helloflight,CO-1614-LAX-IAH,12:30 a.m.,,5:35 a.m., +64,helloflight,UA-2726-FLL-PHL,,4:10 p.m.,,6:25 p.m. +65,helloflight,CO-52-IAH-LAX,7:35 a.m.,,9:21 a.m., +66,helloflight,CO-1090-BOS-IAH,9:45 a.m.,,1:16 p.m., +67,helloflight,AA-616-DFW-DTW,,10:11 a.m.,,1:11 p.m. +68,helloflight,UA-3515-IAD-MSP,,8:26 a.m.,,9:56 a.m. +69,helloflight,AA-3468-CVG-MIA,,7:25 a.m.,,9:29 a.m. +70,helloflight,AA-484-DFW-MIA,,4:29 p.m.,,7:37 p.m. +71,helloflight,AA-1886-BOS-MIA,,10:54 a.m.,,1:36 p.m. +72,helloflight,CO-62-IAH-EWR,2:30 p.m.,,7:03 p.m., +73,helloflight,CO-1561-PHX-IAH,8:05 a.m.,,11:35 a.m., +74,helloflight,UA-2314-ATL-PHL,,3:04 p.m.,,4:43 p.m. +75,helloflight,AA-446-DFW-PHL,,12:13 p.m.,,4:08 p.m. +76,helloflight,CO-1023-IAH-DEN,11:40 a.m.,,1:17 p.m., +77,helloflight,AA-466-IAH-MIA,,6:09 a.m.,,8:50 a.m. +78,helloflight,UA-3050-PHX-CLT,,11:45 a.m.,,5:00 p.m. +79,helloflight,CO-89-IAH-EWR,6:00 a.m.,,10:20 a.m., +80,helloflight,CO-1088-CLE-IAH,8:40 a.m.,,10:54 a.m., +81,helloflight,AA-1664-MIA-ATL,,10:19 a.m.,,11:50 a.m. +82,helloflight,AA-1279-DFW-PHX,,2:03 p.m.,,3:13 p.m. +83,helloflight,UA-854-SFO-IAH,,8:07 a.m.,,1:23 p.m. +84,helloflight,UA-382-IAD-LAX,,12:56 p.m.,,3:09 p.m. +85,helloflight,AA-4344-ORD-DTW,,12:14 p.m.,,1:55 p.m. +86,helloflight,UA-257-JFK-SFO,,2:43 p.m.,,5:22 p.m. +87,helloflight,UA-2945-PHL-CLT,,12:07 p.m.,,1:11 p.m. +88,helloflight,CO-63-EWR-IAH,5:25 p.m.,,8:22 p.m., +89,helloflight,CO-47-IAH-LAX,7:10 p.m.,,8:56 p.m., +90,helloflight,AA-2050-ORD-MIA,,11:06 a.m.,,2:39 p.m. +91,helloflight,UA-248-PHX-ORD,,1:43 p.m.,,5:24 p.m. +92,helloflight,AA-4330-CVG-ORD,3:35 p.m.,,4:00 p.m., +93,helloflight,AA-1434-DFW-MCO,,7:22 a.m.,,10:34 a.m. +94,helloflight,AA-1640-MIA-MCO,,6:47 p.m.,,7:25 p.m. +95,helloflight,AA-4307-ORD-DTW,,6:56 p.m.,,8:38 p.m. +96,helloflight,AA-3-JFK-LAX,,12:12 p.m.,,3:04 p.m. +97,helloflight,CO-1694-LAX-IAH,7:15 p.m.,,12/2/11 12:21 a.m., +98,helloflight,UA-843-LAX-ORD,,2:04 p.m.,,7:20 p.m. +99,helloflight,CO-1193-EWR-MCO,9:15 a.m.,,12:18 p.m., diff --git a/cxx/assets/hospital_dirty.100.csv b/cxx/assets/hospital_dirty.100.csv new file mode 100644 index 0000000..285c3d2 --- /dev/null +++ b/cxx/assets/hospital_dirty.100.csv @@ -0,0 +1,100 @@ +ProviderNumber,HospitalName,Address1,Address2,Address3,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score,Sample,Stateavg +10018,callahan eye foundation hospital,1720 university blvd,,,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs caxxed beta bxockers before coming to the hospitax who were kept on the beta bxockers during the period just before and after their surgery,,,al_scip-card-2 +10018,callahan eye foundation hospital,1720 university blvd,,,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic at the right time (within one hour before surgery) to help prevent infection,,,al_scip-inf-1 +10018,callahan eye foundation hospital,1720 university blvd,,,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-2,surgery patients who were given the right kind of antibiotic to help prevent infection,,,al_scip-inf-2 +10018,callahan eye foundation hospital,1720 university blvd,,,birminghxm,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-3,surgery patients whose preventive antibiotics were stopped at the right time (within 24 hours after surgery),,,al_scip-inf-3 +10018,callahan eye foundation hospital,1720 university blvd,,,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-4,all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery,,,al_scip-inf-4 +10018,callahan eye foundation hospital,1720 university blvd,,,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-6,surgery patients needing hair removed from the surgical area before surgery who had hair removed using a safer method (electric clippers or hair removal cream c not a razor),,,al_scip-inf-6 +10018,callahan eye foundation hospital,1720 university blvd,,,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-vte-1,surgery patients whose doctors ordered treatments to prevent blood clots after certain types of surgeries,,,al_scip-vte-1 +10018,callahan eye foundation hospital,1720 university blvd,,,birmingxam,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-vte-2,patients who got treatment at the right time (within 24 hours before or after their surgery) to help prevent blood clots after certain types of surgery,,,al_scip-vte-2 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-1,heart attack patients given aspirin at arrival,97%,33 patients,al_ami-1 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-2,heart attack patients given aspirin at discharge,92%,13 patients,al_ami-2 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffxeld,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-3,heart attack patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),75%,4 patients,al_ami-3 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-4,heart attack patients given smoking cessation advice/counseling,100%,4 patients,al_ami-4 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-5,heart attack patients given beta blocker at discharge,86%,14 patients,al_ami-5 +1xx19,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-7a,heart attack patients given fibrinolytic medication within 30 minutes of arrival,,0 patients,al_ami-7a +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-8a,heart attack patients given pci within 90 minutes of arrival,,0 patients,al_ami-8a +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-1,heart failure patients given discharge instructions,80%,114 patients,al_hf-1 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acuxe care hospixals,government - hospital district or authority,yes,heart failure,hf-2,heart failure patients given an evaluation of left ventricular systolic (lvs) function,99%,149 patients,al_hf-2 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-3,heart failure patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),92%,3x patients,al_hf-3 +10019,helen keller memorial hospital,1300 south montgomery avenue,,,sheffield,al,35660,jefferson,2563864556,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-4,heart failure patients given smoking cessation advice/counseling,100%,41 patients,al_hf-4 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-1,heart attack patients given aspirin at arrival,98%,270 patients,al_ami-1 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-2,heart attack patients given aspirin at discharge,99%,516 patients,al_ami-2 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart attaxk,ami-3,heart attack patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),96%,71 patients,al_ami-3 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospxtal dxstrxct or authorxty,yes,hearx axxack,ami-4,heart attack patients given smoking cessation advice/counseling,100%,244 patients,al_ami-4 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-5,heart attack patients given beta blocker at discharge,99%,441 patients,al_ami-5 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,334793870x,acute care hospitals,government - hospital district or authority,yxs,heart attack,ami-7a,heart attack patients given fibrinolytic medication within 30 minutes of arrival,,0 patients,al_ami-7a +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-8a,heart attack patients given pci within 90 minutes of arrival,72%,40 patients,al_ami-8a +10001,southeast alabama medxcal center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-1,heart failure patients given discharge instructions,78%,455 patients,al_hf-1 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital xistrict or authority,yes,heart failure,hf-2,heart failure patients given an evaluation of left ventricular systolic (lvs) function,99%,514 patients,al_hf-2 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-3,heart failure patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),91%,165 patients,al_hf-3 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-4,heart failure patients given smoking cessation advice/counseling,100%,110 patients,al_hf-4 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-2,pneumonia patients assessed and given pneumococcal vaccination,93%,191 patients,al_pn-2 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,governmenx - hospixal disxricx or auxhorixy,yes,pneumonia,pn-3b,pneumonia patients whose initial emergency room blood culture was performed prior to the administration of the first hospital dose of antibiotics,89%,125 patients,al_pn-3b +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-4,pneumonia patients given smoking cessation advice/counseling,99%,107 patients,al_pn-4 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-5c,pneumonia patients given initial antibiotic(s) within 6 hours after arrival,92%,196 patients,al_pn-5c +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-6,pneumonia patients given the most appropriate initial antibiotic(s),80%,147 patients,al_pn-6 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-7,pneumonia patients assessed and given influenza vaccination,84%,160 patients,al_pn-7 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,xx479x8701,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs called beta blockers before coming to the hospital who were kept on the beta blockers during the period just before and after their surgery,72%,79 patients,al_scip-card-2 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic at the right time (within one hour before surgery) to help prevent infection,94%,399 patients,al_scip-inf-1 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-2,surgery patients who were given the right kind of antibiotic to help prevent infection,98%,405 patients,al_scip-inf-2 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acxte care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-3,surgery patients whose preventive antibiotics were stopped at the right time (within 24 hours after surgery),88%,385 patients,al_scip-inf-3 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-4,all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery,96%,136 patients,al_scip-inf-4 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-6,surgery patients needing hair removed from the surgical area before surgery who had hair removed using a safer method (electric clippers or hair removal cream c not a razor),90%,537 patients,al_scip-inf-6 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,xl,36302,houston,3347938701,acute care hospitals,gxvernment - hxspital district xr authxrity,yes,surgical infection prevention,scip-vte-1,surgery patients whose doctors ordered treatments to prevent blood clots after certain types of surgeries,86%,185 patients,al_scip-vte-1 +10001,southeast alabama medical center,1108 ross clark circle,,,dothan,al,36302,houston,3347938701,acute care hospitals,government - hospital district or authority,yex,surgical infection prevention,scip-vte-2,patients who got treatment at the right time (within 24 hours before or after their surgery) to help prevent blood clots after certain types of surgery,83%,184 patients,al_scip-vte-2 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,x5957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-1,heart attack patients given aspirin at arrival,80%,20 patients,al_ami-1 +x0005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-2,heart attack patients given aspirin at discharge,60%,5 patients,al_ami-2 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,xl,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-3,heart attack patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),100%,2 patients,al_ami-3 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-4,heart attack patients given smoking cessation advice/counseling,,0 patients,al_ami-4 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-5,heart attack patients given beta blocker at discharge,100%,5 patients,al_ami-5 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-7a,heart attack patients given fibrinolytic medication within 30 minutes of arrival,,0 patients,al_ami-7a +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,xl,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart attack,ami-8a,heart attack patients given pci within 90 minutes of arrival,,0 patients,al_ami-8a +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yxs,heart failure,hf-1,heart failure patients given discharge instructions,78%,110 patients,al_hf-1 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-2,heart failure patients given an evaluation of left ventricular systolic (lvs) function,99%,160 patients,al_hf-2 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-3,heart failure patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),90%,69 patients,al_hf-3 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,heart failure,hf-4,heart failure patients given smoking cessation advice/counseling,94%,34 patients,al_hf-4 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-2,pneumonia patients assessed and given pneumococcal vaccination,97%,172 patients,al_pn-2 +10005,marshall medical center south,2505 u s highway 431 north,,,boxz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-3b,pneumonia patients whose initial emergency room blood culture was performed prior to the administration of the first hospital dose of antibiotics,97%,120 patients,al_pn-xb +10005,marshall medical center south,2505xuxsxhighwayx431xnorth,,,boaz,al,35957,marshall,2x6x938310,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-4,pneumonia patients given smoking cessation advice/counseling,97%,98 patients,al_pn-4 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-5c,pneumonia patients given initial antibiotic(s) within 6 hours after arrival,96%,211 patients,al_pn-5c +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-6,pneumonia patients given the most appropriate initial antibiotic(s),92%,175 patients,al_pn-6 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2x6x938310,acute care hospitals,government - hospital district or authority,yes,pneumonia,pn-7,pneumonia patients assessed and given influenza vaccination,81%,118 patients,al_pn-7 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs called beta blockers before coming to the hospital who were kept on the beta blockers during the period just before and after their surgery,36%,14 patients,al_scip-card-2 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic at the right time (within one hour before surgery) to help prevent infection,96%,168 patients,al_scip-inf-1 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,3595x,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-2,surgery patients who were given the right kind of antibiotic to help prevent infection,93%,167 patients,al_scip-inf-2 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-3,surgery patients whose preventive antibiotics were stopped at the right time (within 24 hours after surgery),95%,158 patients,al_scip-inf-3 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565x38310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-4,all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery,,0 patients,al_scip-inf-4 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-inf-6,surgery patients needing hair removed from the surgical area before surgery who had hair removed using a safer method (electric clippers or hair removal cream c not a razor),100%,237 patients,al_scip-inf-6 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,mxrshxll,2565938310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-vte-1,surgery patients whose doctors ordered treatments to prevent blood clots after certain types of surgeries,78%,76 patients,al_scip-vte-1 +10005,marshall medical center south,2505 u s highway 431 north,,,boaz,al,35957,marshall,2565938310,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-vte-2,patients who got treatment at the right time (within 24 hours before or after their surgery) to help prevent blood clots after certain types of surgery,74%,76 patients,al_scip-vte-2 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart attack,ami-1,heart attack patients given aspirin at arrival,97%,159 patients,al_amx-1 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,3563x,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart attack,ami-2,heart attack patients given aspirin at discharge,96%,247 patients,al_ami-2 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart attack,ami-3,heart attack patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),93%,43 patients,al_ami-3 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,laudxrdalx,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart attack,ami-4,heart attack patients given smoking cessation advice/counseling,100%,103 patients,al_ami-4 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,hearx axxack,ami-5,heart attack patients given beta blocker at discharge,96%,227 patients,al_ami-5 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart attack,ami-7a,heart attack patients given fibrinolytic medication within 30 minutes of arrival,,0 patients,al_ami-7a +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart attack,ami-8a,heart attack patients given pci within 90 minutes of arrival,98%,61 patients,al_ami-8a +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart failure,hf-1,heart failure patients given discharge instructions,79%,217 patients,al_hf-1 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,laudexdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart failure,hf-2,heart failure patients given an evaluation of left ventricular systolic (lvs) function,97%,259 patients,al_hf-2 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart failure,hf-3,heart failure patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),82%,84 patients,al_hf-3 +10006,eliza coffee memorial hospital,205 marengo street,,,fxorence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,heart failure,hf-4,heart failure patients given smoking cessation advice/counseling,100%,53 patients,al_hf-4 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,pneumonia,pn-2,pneumonia patients assessed and given pneumococcal vaccination,90%,174 patients,al_pn-2 +1000x,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,pneumonia,pn-3b,pneumonia patients whose initial emergency room blood culture was performed prior to the administration of the first hospital dose of antibiotics,95%,137 patients,al_pn-3b +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,pneumonia,pn-4,pneumonia patients given smoking cessation advice/counseling,100%,65 patients,al_pn-4 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,xl,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,pneumonia,pn-5c,pneumonia patients given initial antibiotic(s) within 6 hours after arrival,96%,200 patients,al_pn-5c +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,pneumonia,pn-6,pneumonia patients given the most appropriate initial antibiotic(s),91%,138 patients,al_pn-6 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,pneumonia,pn-7,pneumonia patients assessed and given influenza vaccination,88%,85 patients,al_pn-7 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs called beta blockers before coming to the hospital who were kept on the beta blockers during the period just before and after their surgery,96%,107 patients,al_scip-card-2 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic at the right time (within one hour before surgery) to help prevent infection,99%,457 patients,al_scip-inf-1 +10006,eliza coffee memorial hospital,205 marengo street,,,florxncx,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-inf-2,surgery patients who were given the right kind of antibiotic to help prevent infection,99%,462 patients,al_scip-inf-2 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-inf-3,surgery patients whose preventive antibiotics were stopped at the right time (within 24 hours after surgery),94%,439 patients,al_scip-inf-3 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-inf-4,all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery,89%,130 patients,al_scip-inf-4 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-inf-6,surgery patients needing hair removed from the surgical area before surgery who had hair removed using a safer method (electric clippers or hair removal cream c not a razor),100%,619 paxienxs,al_scip-inf-6 +10006,eliza coffee memorial hospital,205 marengo street,,,florence,al,35631,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-vte-1,surgery patients whose doctors ordered treatments to prevent blood clots after certain types of surgeries,95%,242 patients,al_scip-vte-1 +10006,eliza coffee memorial hospital,205 marengo street,,,flxrence,al,3563x,lauderdale,2567688400,acute care hospitals,voluntary non-profit - other,yes,surgical infection prevention,scip-vte-2,patients who got treatment at the right time (within 24 hours before or after their surgery) to help prevent blood clots after certain types of surgery,92%,242 patients,al_scip-vte-2 +10007,mizell memorial hospital,702xnxmainxst,,,opp,al,36467,covington,3344933541,acute care hospitals,voluntary non-profit - private,no,heart attack,ami-1,xeart attack patients given aspirin at arrival,60%,5 patients,al_ami-1 +10007,mizell memorial hospital,702 n main st,,,opp,al,36467,covington,3344933541,acute care hospitals,voluntary non-profit - private,no,heart attack,ami-2,heart attack patients given aspirin at discharge,50%,2 patients,al_ami-2 +10007,mizell memorial hospital,702 n main st,,,opp,al,36467,covington,3344933541,acute care hospitals,voluntary non-profit - private,no,heart attack,ami-3,heart attack patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd),,0 patients,al_ami-3 +10007,mizell memorial hospital,702 n main st,,,opp,al,36467,covington,3344933541,acute care hospitals,voluntary non-profit - private,no,heart attack,axi-4,heart attack patients given smoking cessation advice/counseling,,0 patients,al_ami-4 +10007,mizell memorial hospital,702 n main st,,,opp,xl,36467,covington,3344933541,acute care hosxitals,voluntary non-profit - private,no,heart attack,ami-5,heart attack patients given beta blocker at discharge,100%,2 patients,al_ami-5 diff --git a/cxx/assets/rents_dirty.100.csv b/cxx/assets/rents_dirty.100.csv new file mode 100644 index 0000000..78fd3e0 --- /dev/null +++ b/cxx/assets/rents_dirty.100.csv @@ -0,0 +1,100 @@ +Column1,Room Type,Monthly Rent,County,State +0,studio,486.0,Mahoning County,OH +1,4br,2152.0,Clark County,NV +2,1br,1267.0,Gwinnett County,GA +3,3br,1180.0,Granville County,NC +4,,1436.0,Suffolk County,NY +5,2br,1768.0,Miami-Dade County,FL +6,,585.0,Sebastian County,AR +7,studio,599.0,Lapeer County,MI +8,3br,3056.0,Monterey County,CA +9,3br,1193.0,St. Louis County,MN +10,4br,1364.0,Pickaway County,OH +11,2br,1937.0,Los Angeles County, +12,studio,960.0,Buncombe County,NC +13,3br,922.0,Greene County,NC +14,3br,2123.0,Pierce County,WA +15,1br,892.0,Clark County,NV +16,3br,1371.0,Hinds County,MS +17,studio,756.0,Richland County, +18,2br,1140.0,Cowlitz County,WA +19,studio,1180.0,Broward County,FL +20,3br,1148.0,Cuyahoga County,OH +21,studio,641.0,Franklin County,OH +22,,1089.0,Salt Lake County,UT +23,,1452.0,Ketchikan Gateway Borough,AK +24,studio,744.0,Salt Lake County,UT +25,1br,824.0,York County,PA +26,4br,1509.0,Lafayette Parish,LA +27,1br,985.0,Pinellas County,FL +28,4br,1723.0,Pinal County,AZ +29,2br,727.0,Monroe County,MS +30,2br,902.0,Barry County,MI +31,2br,1681.0,Morris County,NJ +32,3br,3181.0,Rockingham Counyt,NH +33,studio,795.0,Smith County,TX +34,,1704.0,Los Angeles County,CA +35,2br,995.0,Marion County,IN +36,2br,1022.0,Allegheny County,PA +37,studio,454.0,Treutlen County,GA +38,studio,1022.0,Broward County, +39,4br,1658.0,Johnson County,KS +40,1br,687.0,Burnett County,WI +41,1br,492.0,Winston County,AL +42,2br,958.0,Polk County,IA +43,3br,3608.0,Orange County, +44,4br,1654.0,Jefferson County,WI +45,studio,1082.0,York County,ME +46,2br,1252.0,Kendall County,IL +47,1br,643.0,Cuyahoga County,OH +48,studio,879.0,Craven County,NC +49,2br,808.0,Rush County,IN +50,,1596.0,Indian River County,FL +51,2br,2328.0,Middlesex County,MA +52,3br,1086.0,Tehama County,CA +53,studio,844.0,Matanuska-Susitna Borough,AK +54,4br,1430.0,Tipton County,TN +55,4br,2084.0,New Haven County, +56,3br,1920.0,Riverside County,CA +57,1br,588.0,Atascosa County,TX +58,4br,2435.0,Maricopa County,AZ +59,1br,1.428,Hudson County, +60,2br,658.0,Miami County,IN +61,3br,2495.0,Essex County,MA +62,4br,1354.0,Harrison County,MS +63,1br,783.0,Wayne County,MI +64,2br,1072.0,Hennepin County,MN +65,4br,2298.0,Strafford Couknty,NH +66,,1690.0,Wayne County,MI +67,1br,887.0,Bexar County,TX +68,3br,1660.0,New Haven County,CT +69,3br,1065.0,Le Flore County,OK +70,1br,1087.0,Riverside County,CA +71,3br,1822.0,Albemarle County,VA +72,3br,1181.0,Allegheny County,PA +73,4br,1030.0,Lenoir County, +74,2br,812.0,Indiana County,PA +75,1br,1019.0,Bexar County, +76,2br,1587.0,Williamson County,TX +77,2br,722.0,Sheboygan County,WI +78,1br,695.0,Stearns County,MN +79,4br,1198.0,St. Louis County,MO +80,4br,1582.0,Knox County,TN +81,4br,1021.0,Bladen County,NC +82,2br,986.0,Marion County,IN +83,3br,1452.0,Rockingham County,VA +84,1br,816.0,Douglas County,NV +85,3br,1257.0,Pulaski County,AR +86,2br,1569.0,Clark County, +87,4br,2020.0,Clark County,NV +88,3br,1181.0,Pitt County, +89,3br,1393.0,Shasta County,CA +90,2br,1905.0,Rockland County,NY +91,studio,640.0,Berks County, +92,3br,1146.0,Richland County, +93,studio,1119.0,Barnstable County,MA +94,4br,1322.0,Accomack County, +95,3br,1330.0,Smith County,TX +96,1br,911.0,Kent County,DE +97,studio,554.0,Macomb County, +98,studio,639.0,Ozaukee County,WI diff --git a/cxx/emissions/bigram_string.cc b/cxx/emissions/bigram_string.cc index 8b48ebd..03b3a3e 100644 --- a/cxx/emissions/bigram_string.cc +++ b/cxx/emissions/bigram_string.cc @@ -109,6 +109,8 @@ double BigramStringEmission::log_prob_distance(const StrAlignment& alignment, do void BigramStringEmission::incorporate( const std::pair& x, double weight) { + printf("In BigramStringEmission::incorporate, clean = %s dirty = %s\n", + x.first.c_str(), x.second.c_str()); N += weight; std::vector alignments; @@ -118,12 +120,14 @@ void BigramStringEmission::incorporate( return log_prob_distance(a, old_cost); }, &alignments); + printf("Debug: found alignments\n"); double total_prob = 0.0; for (auto& a : alignments) { a.cost = exp(a.cost); // Turn all costs into non-log probabilities total_prob += a.cost; } + printf("Debug: total_prob = %f\n", total_prob); for (const auto& a : alignments) { double w = weight * a.cost / total_prob; @@ -170,6 +174,7 @@ void BigramStringEmission::incorporate( } } } + printf("Debug: done with BigramStringEmission::incorporate\n"); } double BigramStringEmission::logp( diff --git a/cxx/emissions/string_alignment.cc b/cxx/emissions/string_alignment.cc index 7d2c90a..f0e686b 100644 --- a/cxx/emissions/string_alignment.cc +++ b/cxx/emissions/string_alignment.cc @@ -21,8 +21,11 @@ void topk_alignments(int k, const std::string& s1, const std::string& s2, std::pop_heap(heap.begin(), heap.end()); heap.pop_back(); + // Does this alignment reach the end of both strings? If so, ship it. double old_cost = -heap_top.cost; + printf("In topk_alignments, heap.size = %ld old_cost = %f\n", + heap.size(), old_cost); if (std::cmp_equal(heap_top.s1_position, s1.length()) && std::cmp_equal(heap_top.s2_position, s2.length())) { heap_top.cost = old_cost; diff --git a/cxx/hirm.cc b/cxx/hirm.cc index aac64e9..70364c9 100644 --- a/cxx/hirm.cc +++ b/cxx/hirm.cc @@ -19,8 +19,11 @@ HIRM::HIRM(const T_schema& schema, std::mt19937* prng) { void HIRM::incorporate(std::mt19937* prng, const std::string& r, const T_items& items, const ObservationVariant& value) { + printf("Debug: in HIRM::incorporate for relation %s\n", r.c_str()); IRM* irm = relation_to_irm(r); + printf("Debug: found irm\n"); irm->incorporate(prng, r, items, value); + printf("Debug: done incorporating into irm.\n"); } void HIRM::unincorporate(const std::string& r, const T_items& items) { diff --git a/cxx/integration_tests.sh b/cxx/integration_tests.sh index db99567..77cc77a 100755 --- a/cxx/integration_tests.sh +++ b/cxx/integration_tests.sh @@ -7,12 +7,12 @@ set -x # Run integration test suite bazel build :hirm pclean:pclean tests:test_hirm_animals tests:test_irm_two_relations tests:test_misc -./bazel-bin/tests/test_hirm_animals -./bazel-bin/tests/test_irm_two_relations -./bazel-bin/tests/test_misc -./bazel-bin/hirm --mode=irm --iters=5 assets/animals.binary -./bazel-bin/hirm --seed=1 --iters=5 assets/animals.unary -./bazel-bin/hirm --iters=5 --load=assets/animals.unary.1.hirm assets/animals.unary -./bazel-bin/pclean/pclean --schema=assets/flights.schema --obs=assets/flights_dirty.csv --iters=5 -./bazel-bin/pclean/pclean --schema=assets/hospitals.schema --obs=assets/hospitals_dirty.csv --iters=5 -./bazel-bin/pclean/pclean --schema=assets/rents.schema --obs=assets/rents_dirty.csv --iters=5 +#./bazel-bin/tests/test_hirm_animals +#./bazel-bin/tests/test_irm_two_relations +#./bazel-bin/tests/test_misc +#./bazel-bin/hirm --mode=irm --iters=5 assets/animals.binary +#./bazel-bin/hirm --seed=1 --iters=5 assets/animals.unary +#./bazel-bin/hirm --iters=5 --load=assets/animals.unary.1.hirm assets/animals.unary +#./bazel-bin/pclean/pclean --schema=assets/flights.schema --obs=assets/flights_dirty.100.csv --iters=5 +#./bazel-bin/pclean/pclean --schema=assets/hospitals.schema --obs=assets/hospital_dirty.100.csv --iters=5 +./bazel-bin/pclean/pclean --schema=assets/rents.schema --obs=assets/rents_dirty.100.csv --iters=5 diff --git a/cxx/irm.cc b/cxx/irm.cc index 1e864b9..527e53c 100644 --- a/cxx/irm.cc +++ b/cxx/irm.cc @@ -44,6 +44,7 @@ IRM::~IRM() { void IRM::incorporate(std::mt19937* prng, const std::string& r, const T_items& items, ObservationVariant value) { + printf("Debug: in IRM::incorporate for relation %s\n", r.c_str()); std::visit( [&](auto rel) { auto v = std::get< @@ -51,6 +52,7 @@ void IRM::incorporate(std::mt19937* prng, const std::string& r, rel->incorporate(prng, items, v); }, relations.at(r)); + printf("Done: IRM::incorporate for relation %s\n", r.c_str()); } void IRM::unincorporate(const std::string& r, const T_items& items) { diff --git a/cxx/pclean/csv.cc b/cxx/pclean/csv.cc index 049723e..7c6649b 100644 --- a/cxx/pclean/csv.cc +++ b/cxx/pclean/csv.cc @@ -1,6 +1,7 @@ #include "pclean/csv.hh" #include +#include #include #include @@ -45,6 +46,10 @@ DataFrame DataFrame::from_csv( // TODO(thomaswc): Handle quoted fields while (std::getline(ss, part, ',')) { if (first_line && column_names.empty()) { + // Erase white space from end of part + while (isspace(part.back())) { + part.pop_back(); + } col_names.push_back(part); df.data[part] = {}; continue; @@ -53,15 +58,14 @@ DataFrame DataFrame::from_csv( df.data[col_names[i++]].push_back(part); } if (!first_line) { + if (line.back() == ',') { + // std::getline is broken and won't let the last field be empty. + df.data[col_names[i++]].push_back(""); + } if (i != col_names.size()) { - if (line.back() == ',') { - // std::getline is broken and won't let the last field be empty. - df.data[col_names[i++]].push_back(""); - } else { - printf("Only found %ld out of %ld expected columns in line\n%s\n", - i, col_names.size(), line.c_str()); - assert(false); - } + printf("Only found %ld out of %ld expected columns in line\n%s\n", + i, col_names.size(), line.c_str()); + assert(false); } } first_line = false; diff --git a/cxx/pclean/pclean.cc b/cxx/pclean/pclean.cc index d4d16af..f0668aa 100644 --- a/cxx/pclean/pclean.cc +++ b/cxx/pclean/pclean.cc @@ -70,23 +70,16 @@ int main(int argc, char** argv) { std::cout << "Reading observations file from " << obs_fn << "\n"; DataFrame df = DataFrame::from_csv(obs_fn); - // Validate that we have a relation for each observation column. - for (const auto &col : df.data) { - if (!hirm_schema.contains(col.first)) { - printf("Error, could not find HIRM relation for column %s\n", - col.first.c_str()); - assert(false); - } - } - // Create model std::cout << "Creating hirm ...\n"; HIRM hirm(hirm_schema, &prng); // Incorporate observations. - std::cout << "Incorporating observations ...\n"; + std::cout << "Translating observations ...\n"; T_observations observations = translate_observations(df, hirm_schema); + std::cout << "Encoding observations ...\n"; T_encoding encoding = encode_observations(hirm_schema, observations); + std::cout << "Incorporating observations ...\n"; incorporate_observations(&prng, &hirm, encoding, observations); // Run inference diff --git a/cxx/util_io.cc b/cxx/util_io.cc index 032d206..59ecf58 100644 --- a/cxx/util_io.cc +++ b/cxx/util_io.cc @@ -259,6 +259,7 @@ void incorporate_observations_relation( std::unordered_map>& relation_items, std::unordered_set& completed_relations) { + printf("Debug: in incorporate_observations_relation for %s\n", relation.c_str()); RelationVariant rel_var = std::visit([&](auto m) { return m->get_relation(relation); }, h_irm); // base relations must be incorporated before noisy relations, so recursively @@ -285,20 +286,28 @@ void incorporate_observations_relation( completed_relations); } + printf("Debug: done recursively incorporating all base relations.\n"); ObservationVariant ov; if (observations.contains(relation)) { + printf("Debug: in first branch\n"); // If this relation is observed, incorporate its observations. for (const auto& [items, value] : observations.at(relation)) { + printf("Debug: observation has value = %s\n", value.c_str()); std::visit([&](const auto &r) {ov = r->from_string(value); }, rel_var); + printf("Converted it to ObservationVariant\n"); std::visit([&](auto& m) { m->incorporate(prng, relation, items, ov); }, h_irm); + printf("Incorporated it into h_irm\n"); } + printf("\n"); } else { + printf("Debug: in second branch\n"); // If this relation is not observed, incorporate samples from the prior. // This currently assumes a base relation's items are always a prefix of the // noisy relation's items. for (const auto& items : relation_items.at(relation)) { + printf("."); std::visit( [&](auto rel) { using T = typename std::remove_pointer_t< @@ -315,8 +324,10 @@ void incorporate_observations_relation( }, rel_var); } + printf("\n"); } completed_relations.insert(relation); + printf("Debug: finished incorporate_observations_relation for %s\n", relation.c_str()); } void incorporate_observations(std::mt19937* prng, @@ -349,6 +360,7 @@ void incorporate_observations(std::mt19937* prng, } } + printf("Debug: done incorporating observations\n"); std::unordered_map noisy_to_base; std::unordered_map> base_to_noisy = std::visit([](const auto& m) { return m->base_to_noisy_relations; }, @@ -364,6 +376,8 @@ void incorporate_observations(std::mt19937* prng, } } + printf("Debug: done computing noisy_to_base\n"); + std::unordered_set completed_relations; for (const std::string& relation : observed_relations) { if (!completed_relations.contains(relation)) { @@ -372,6 +386,9 @@ void incorporate_observations(std::mt19937* prng, relation_items, completed_relations); } } + + printf("Debug: done incorporate_observations_relation \n"); + } void to_txt(std::ostream& fp, const IRM& irm, const T_encoding& encoding) { From f0c4c0c5e75b72952dc3c310574608f2340ba3f3 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 7 Aug 2024 13:32:51 +0000 Subject: [PATCH 08/14] Fix string alignment bug --- cxx/emissions/string_alignment.cc | 14 ++++++++++++-- cxx/integration_tests.sh | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cxx/emissions/string_alignment.cc b/cxx/emissions/string_alignment.cc index f0e686b..ca64b51 100644 --- a/cxx/emissions/string_alignment.cc +++ b/cxx/emissions/string_alignment.cc @@ -1,3 +1,4 @@ +#include #include #include "emissions/string_alignment.hh" @@ -5,6 +6,7 @@ void topk_alignments(int k, const std::string& s1, const std::string& s2, CostFunction cost_function, std::vector* alignments) { std::vector heap; + std::set> visited; StrAlignment empty_alignment; empty_alignment.cost = -0.0; // We negate all costs on the heap so that // the front of the heap is the min cost @@ -21,11 +23,19 @@ void topk_alignments(int k, const std::string& s1, const std::string& s2, std::pop_heap(heap.begin(), heap.end()); heap.pop_back(); + std::pair here = std::make_pair( + heap_top.s1_position, heap_top.s2_position); + if (visited.contains(here)) { + continue; + } + visited.insert(here); // Does this alignment reach the end of both strings? If so, ship it. double old_cost = -heap_top.cost; - printf("In topk_alignments, heap.size = %ld old_cost = %f\n", - heap.size(), old_cost); + /* + printf("In topk_alignments, heap.size = %ld old_cost = %f s1 pos = %ld s2 pos = %ld\n", + heap.size(), old_cost, heap_top.s1_position, heap_top.s2_position); + */ if (std::cmp_equal(heap_top.s1_position, s1.length()) && std::cmp_equal(heap_top.s2_position, s2.length())) { heap_top.cost = old_cost; diff --git a/cxx/integration_tests.sh b/cxx/integration_tests.sh index 77cc77a..aa70850 100755 --- a/cxx/integration_tests.sh +++ b/cxx/integration_tests.sh @@ -13,6 +13,6 @@ bazel build :hirm pclean:pclean tests:test_hirm_animals tests:test_irm_two_relat #./bazel-bin/hirm --mode=irm --iters=5 assets/animals.binary #./bazel-bin/hirm --seed=1 --iters=5 assets/animals.unary #./bazel-bin/hirm --iters=5 --load=assets/animals.unary.1.hirm assets/animals.unary -#./bazel-bin/pclean/pclean --schema=assets/flights.schema --obs=assets/flights_dirty.100.csv --iters=5 -#./bazel-bin/pclean/pclean --schema=assets/hospitals.schema --obs=assets/hospital_dirty.100.csv --iters=5 +./bazel-bin/pclean/pclean --schema=assets/flights.schema --obs=assets/flights_dirty.100.csv --iters=5 +./bazel-bin/pclean/pclean --schema=assets/hospitals.schema --obs=assets/hospital_dirty.100.csv --iters=5 ./bazel-bin/pclean/pclean --schema=assets/rents.schema --obs=assets/rents_dirty.100.csv --iters=5 From ecd74e598526390c64c34eac27cb612a914b47c6 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 7 Aug 2024 14:56:25 +0000 Subject: [PATCH 09/14] Remove debugging printfs --- cxx/emissions/bigram_string.cc | 5 ----- cxx/emissions/string_alignment.cc | 13 ------------- cxx/hirm.cc | 3 --- cxx/integration_tests.sh | 12 ++++++------ cxx/irm.cc | 2 -- cxx/util_io.cc | 17 ----------------- 6 files changed, 6 insertions(+), 46 deletions(-) diff --git a/cxx/emissions/bigram_string.cc b/cxx/emissions/bigram_string.cc index 03b3a3e..8b48ebd 100644 --- a/cxx/emissions/bigram_string.cc +++ b/cxx/emissions/bigram_string.cc @@ -109,8 +109,6 @@ double BigramStringEmission::log_prob_distance(const StrAlignment& alignment, do void BigramStringEmission::incorporate( const std::pair& x, double weight) { - printf("In BigramStringEmission::incorporate, clean = %s dirty = %s\n", - x.first.c_str(), x.second.c_str()); N += weight; std::vector alignments; @@ -120,14 +118,12 @@ void BigramStringEmission::incorporate( return log_prob_distance(a, old_cost); }, &alignments); - printf("Debug: found alignments\n"); double total_prob = 0.0; for (auto& a : alignments) { a.cost = exp(a.cost); // Turn all costs into non-log probabilities total_prob += a.cost; } - printf("Debug: total_prob = %f\n", total_prob); for (const auto& a : alignments) { double w = weight * a.cost / total_prob; @@ -174,7 +170,6 @@ void BigramStringEmission::incorporate( } } } - printf("Debug: done with BigramStringEmission::incorporate\n"); } double BigramStringEmission::logp( diff --git a/cxx/emissions/string_alignment.cc b/cxx/emissions/string_alignment.cc index ca64b51..7d2c90a 100644 --- a/cxx/emissions/string_alignment.cc +++ b/cxx/emissions/string_alignment.cc @@ -1,4 +1,3 @@ -#include #include #include "emissions/string_alignment.hh" @@ -6,7 +5,6 @@ void topk_alignments(int k, const std::string& s1, const std::string& s2, CostFunction cost_function, std::vector* alignments) { std::vector heap; - std::set> visited; StrAlignment empty_alignment; empty_alignment.cost = -0.0; // We negate all costs on the heap so that // the front of the heap is the min cost @@ -23,19 +21,8 @@ void topk_alignments(int k, const std::string& s1, const std::string& s2, std::pop_heap(heap.begin(), heap.end()); heap.pop_back(); - std::pair here = std::make_pair( - heap_top.s1_position, heap_top.s2_position); - if (visited.contains(here)) { - continue; - } - visited.insert(here); - // Does this alignment reach the end of both strings? If so, ship it. double old_cost = -heap_top.cost; - /* - printf("In topk_alignments, heap.size = %ld old_cost = %f s1 pos = %ld s2 pos = %ld\n", - heap.size(), old_cost, heap_top.s1_position, heap_top.s2_position); - */ if (std::cmp_equal(heap_top.s1_position, s1.length()) && std::cmp_equal(heap_top.s2_position, s2.length())) { heap_top.cost = old_cost; diff --git a/cxx/hirm.cc b/cxx/hirm.cc index 70364c9..aac64e9 100644 --- a/cxx/hirm.cc +++ b/cxx/hirm.cc @@ -19,11 +19,8 @@ HIRM::HIRM(const T_schema& schema, std::mt19937* prng) { void HIRM::incorporate(std::mt19937* prng, const std::string& r, const T_items& items, const ObservationVariant& value) { - printf("Debug: in HIRM::incorporate for relation %s\n", r.c_str()); IRM* irm = relation_to_irm(r); - printf("Debug: found irm\n"); irm->incorporate(prng, r, items, value); - printf("Debug: done incorporating into irm.\n"); } void HIRM::unincorporate(const std::string& r, const T_items& items) { diff --git a/cxx/integration_tests.sh b/cxx/integration_tests.sh index aa70850..499550e 100755 --- a/cxx/integration_tests.sh +++ b/cxx/integration_tests.sh @@ -7,12 +7,12 @@ set -x # Run integration test suite bazel build :hirm pclean:pclean tests:test_hirm_animals tests:test_irm_two_relations tests:test_misc -#./bazel-bin/tests/test_hirm_animals -#./bazel-bin/tests/test_irm_two_relations -#./bazel-bin/tests/test_misc -#./bazel-bin/hirm --mode=irm --iters=5 assets/animals.binary -#./bazel-bin/hirm --seed=1 --iters=5 assets/animals.unary -#./bazel-bin/hirm --iters=5 --load=assets/animals.unary.1.hirm assets/animals.unary +./bazel-bin/tests/test_hirm_animals +./bazel-bin/tests/test_irm_two_relations +./bazel-bin/tests/test_misc +./bazel-bin/hirm --mode=irm --iters=5 assets/animals.binary +./bazel-bin/hirm --seed=1 --iters=5 assets/animals.unary +./bazel-bin/hirm --iters=5 --load=assets/animals.unary.1.hirm assets/animals.unary ./bazel-bin/pclean/pclean --schema=assets/flights.schema --obs=assets/flights_dirty.100.csv --iters=5 ./bazel-bin/pclean/pclean --schema=assets/hospitals.schema --obs=assets/hospital_dirty.100.csv --iters=5 ./bazel-bin/pclean/pclean --schema=assets/rents.schema --obs=assets/rents_dirty.100.csv --iters=5 diff --git a/cxx/irm.cc b/cxx/irm.cc index 527e53c..1e864b9 100644 --- a/cxx/irm.cc +++ b/cxx/irm.cc @@ -44,7 +44,6 @@ IRM::~IRM() { void IRM::incorporate(std::mt19937* prng, const std::string& r, const T_items& items, ObservationVariant value) { - printf("Debug: in IRM::incorporate for relation %s\n", r.c_str()); std::visit( [&](auto rel) { auto v = std::get< @@ -52,7 +51,6 @@ void IRM::incorporate(std::mt19937* prng, const std::string& r, rel->incorporate(prng, items, v); }, relations.at(r)); - printf("Done: IRM::incorporate for relation %s\n", r.c_str()); } void IRM::unincorporate(const std::string& r, const T_items& items) { diff --git a/cxx/util_io.cc b/cxx/util_io.cc index 59ecf58..032d206 100644 --- a/cxx/util_io.cc +++ b/cxx/util_io.cc @@ -259,7 +259,6 @@ void incorporate_observations_relation( std::unordered_map>& relation_items, std::unordered_set& completed_relations) { - printf("Debug: in incorporate_observations_relation for %s\n", relation.c_str()); RelationVariant rel_var = std::visit([&](auto m) { return m->get_relation(relation); }, h_irm); // base relations must be incorporated before noisy relations, so recursively @@ -286,28 +285,20 @@ void incorporate_observations_relation( completed_relations); } - printf("Debug: done recursively incorporating all base relations.\n"); ObservationVariant ov; if (observations.contains(relation)) { - printf("Debug: in first branch\n"); // If this relation is observed, incorporate its observations. for (const auto& [items, value] : observations.at(relation)) { - printf("Debug: observation has value = %s\n", value.c_str()); std::visit([&](const auto &r) {ov = r->from_string(value); }, rel_var); - printf("Converted it to ObservationVariant\n"); std::visit([&](auto& m) { m->incorporate(prng, relation, items, ov); }, h_irm); - printf("Incorporated it into h_irm\n"); } - printf("\n"); } else { - printf("Debug: in second branch\n"); // If this relation is not observed, incorporate samples from the prior. // This currently assumes a base relation's items are always a prefix of the // noisy relation's items. for (const auto& items : relation_items.at(relation)) { - printf("."); std::visit( [&](auto rel) { using T = typename std::remove_pointer_t< @@ -324,10 +315,8 @@ void incorporate_observations_relation( }, rel_var); } - printf("\n"); } completed_relations.insert(relation); - printf("Debug: finished incorporate_observations_relation for %s\n", relation.c_str()); } void incorporate_observations(std::mt19937* prng, @@ -360,7 +349,6 @@ void incorporate_observations(std::mt19937* prng, } } - printf("Debug: done incorporating observations\n"); std::unordered_map noisy_to_base; std::unordered_map> base_to_noisy = std::visit([](const auto& m) { return m->base_to_noisy_relations; }, @@ -376,8 +364,6 @@ void incorporate_observations(std::mt19937* prng, } } - printf("Debug: done computing noisy_to_base\n"); - std::unordered_set completed_relations; for (const std::string& relation : observed_relations) { if (!completed_relations.contains(relation)) { @@ -386,9 +372,6 @@ void incorporate_observations(std::mt19937* prng, relation_items, completed_relations); } } - - printf("Debug: done incorporate_observations_relation \n"); - } void to_txt(std::ostream& fp, const IRM& irm, const T_encoding& encoding) { From b6e508e9414fce809d79400c73c3f58da46d41e3 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 7 Aug 2024 16:39:06 +0000 Subject: [PATCH 10/14] Fix CountyName in hospitals.schema and improve error message in util_io --- cxx/assets/hospitals.schema | 2 +- cxx/integration_tests.sh | 4 ++-- cxx/util_io.cc | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cxx/assets/hospitals.schema b/cxx/assets/hospitals.schema index 6439c0b..e6a0c38 100644 --- a/cxx/assets/hospitals.schema +++ b/cxx/assets/hospitals.schema @@ -43,7 +43,7 @@ observe hosp.loc.city as City hosp.loc.county.state as State hosp.zip as ZipCode - hosp.loc.county.county as County + hosp.loc.county.county as CountyName hosp.phone as PhoneNumber hosp.type.desc as HospitalType hosp.owner as HospitalOwner diff --git a/cxx/integration_tests.sh b/cxx/integration_tests.sh index 499550e..34ca83a 100755 --- a/cxx/integration_tests.sh +++ b/cxx/integration_tests.sh @@ -13,6 +13,6 @@ bazel build :hirm pclean:pclean tests:test_hirm_animals tests:test_irm_two_relat ./bazel-bin/hirm --mode=irm --iters=5 assets/animals.binary ./bazel-bin/hirm --seed=1 --iters=5 assets/animals.unary ./bazel-bin/hirm --iters=5 --load=assets/animals.unary.1.hirm assets/animals.unary -./bazel-bin/pclean/pclean --schema=assets/flights.schema --obs=assets/flights_dirty.100.csv --iters=5 +#./bazel-bin/pclean/pclean --schema=assets/flights.schema --obs=assets/flights_dirty.100.csv --iters=5 ./bazel-bin/pclean/pclean --schema=assets/hospitals.schema --obs=assets/hospital_dirty.100.csv --iters=5 -./bazel-bin/pclean/pclean --schema=assets/rents.schema --obs=assets/rents_dirty.100.csv --iters=5 +#./bazel-bin/pclean/pclean --schema=assets/rents.schema --obs=assets/rents_dirty.100.csv --iters=5 diff --git a/cxx/util_io.cc b/cxx/util_io.cc index 032d206..2be46c1 100644 --- a/cxx/util_io.cc +++ b/cxx/util_io.cc @@ -356,9 +356,10 @@ void incorporate_observations(std::mt19937* prng, for (const auto& [base_name, noisy_names] : base_to_noisy) { for (const std::string& noisy_name : noisy_names) { if (!base_to_noisy.contains(noisy_name)) { - assert(observations.contains(noisy_name) && - "A relation that is not the base of a noisy relation must be " - "observed."); + if (!observations.contains(noisy_name)) { + printf("Relation %s has no observations and is not the base of a noisy relation.\n", noisy_name.c_str()); + assert(false); + } } noisy_to_base[noisy_name] = base_name; } From 116e4068145d37ac417a281cc89c1561bc9ed85b Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 7 Aug 2024 17:53:32 +0000 Subject: [PATCH 11/14] Mark observable relations in test --- cxx/pclean/pclean_lib_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cxx/pclean/pclean_lib_test.cc b/cxx/pclean/pclean_lib_test.cc index 8a61073..083d64c 100644 --- a/cxx/pclean/pclean_lib_test.cc +++ b/cxx/pclean/pclean_lib_test.cc @@ -23,13 +23,13 @@ BOOST_AUTO_TEST_CASE(test_translate_observations) { {"County:state", T_clean_relation{{"dCounty"}, false, DistributionSpec("stringcat", state_params)}}, {"Room Type", - T_clean_relation{{"dObs"}, false, DistributionSpec("stringcat", br_params)}}, + T_clean_relation{{"dObs"}, true, DistributionSpec("stringcat", br_params)}}, {"Monthly Rent", - T_clean_relation{{"dObs"}, false, DistributionSpec("normal")}}, + T_clean_relation{{"dObs"}, true, DistributionSpec("normal")}}, {"County", - T_noisy_relation{{"dCounty", "dObs"}, false, EmissionSpec("bigram"), "County:name"}}, + T_noisy_relation{{"dCounty", "dObs"}, true, EmissionSpec("bigram"), "County:name"}}, {"State", - T_noisy_relation{{"dCounty", "dObs"}, false, EmissionSpec("bigram"), "County:state"}}}; + T_noisy_relation{{"dCounty", "dObs"}, true, EmissionSpec("bigram"), "County:state"}}}; T_observations obs = translate_observations(df, schema); From fcfa603474e73cbd942c5ec5450b957e0f8f966d Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 7 Aug 2024 18:57:55 +0000 Subject: [PATCH 12/14] Mark query-based relations as observed --- cxx/pclean/schema_helper.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc index b85b7dc..6d5cfaa 100644 --- a/cxx/pclean/schema_helper.cc +++ b/cxx/pclean/schema_helper.cc @@ -122,8 +122,10 @@ T_schema PCleanSchemaHelper::make_hirm_schema() { domains[query_class.name], annotated_domains[query_class.name], path_prefix); - tschema[f.name] = get_emission_relation( + T_noisy_relation tnr = get_emission_relation( std::get(sv.spec), reordered_domains, base_relation); + tnr.is_observed = true; + tschema[f.name] = tnr; } return tschema; From ab02dbc027e16853a64c9a1c3c8dc25c88ae35d6 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 7 Aug 2024 19:52:31 +0000 Subject: [PATCH 13/14] Fix schema_helper_test.cc --- cxx/pclean/schema_helper_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc index da524d0..b85550a 100644 --- a/cxx/pclean/schema_helper_test.cc +++ b/cxx/pclean/schema_helper_test.cc @@ -172,7 +172,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("Specialty")); T_noisy_relation nr1 = std::get(tschema["Specialty"]); - BOOST_TEST(!nr1.is_observed); + BOOST_TEST(nr1.is_observed); BOOST_TEST((nr1.emission_spec.emission == EmissionEnum::bigram_string)); // "School", "Physician" moved to the front of the list. expected_domains = {"School", "Physician", "City", "Practice", "Record"}; @@ -180,7 +180,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("School")); T_noisy_relation nr2 = std::get(tschema["School"]); - BOOST_TEST(!nr2.is_observed); + BOOST_TEST(nr2.is_observed); BOOST_TEST((nr2.emission_spec.emission == EmissionEnum::bigram_string)); // "School" moved to the front of the list. expected_domains = {"School", "Physician", "City", "Practice", "Record"}; @@ -188,7 +188,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("Degree")); T_noisy_relation nr3 = std::get(tschema["Degree"]); - BOOST_TEST(!nr3.is_observed); + BOOST_TEST(nr3.is_observed); BOOST_TEST((nr3.emission_spec.emission == EmissionEnum::bigram_string)); // "School", "Physician" moved to the front of the list. expected_domains = {"School", "Physician", "City", "Practice", "Record"}; @@ -196,7 +196,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("City")); T_noisy_relation nr4 = std::get(tschema["City"]); - BOOST_TEST(!nr4.is_observed); + BOOST_TEST(nr4.is_observed); BOOST_TEST((nr4.emission_spec.emission == EmissionEnum::bigram_string)); // "City" moved to the front of the list. expected_domains = {"City", "School", "Physician", "Practice", "Record"}; @@ -204,7 +204,7 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schmea) { BOOST_TEST(tschema.contains("State")); T_noisy_relation nr5 = std::get(tschema["State"]); - BOOST_TEST(!nr5.is_observed); + BOOST_TEST(nr5.is_observed); BOOST_TEST((nr5.emission_spec.emission == EmissionEnum::bigram_string)); // "City" moved to the front of the list. expected_domains = {"City", "School", "Physician", "Practice", "Record"}; From fb10d29dfef56f775eac332819528fa1468a1320 Mon Sep 17 00:00:00 2001 From: Thomas Colthurst Date: Wed, 7 Aug 2024 20:23:24 +0000 Subject: [PATCH 14/14] Add TODO requested by reviewer --- cxx/pclean/pclean_lib.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cxx/pclean/pclean_lib.cc b/cxx/pclean/pclean_lib.cc index 59d40fa..cb5c6b1 100644 --- a/cxx/pclean/pclean_lib.cc +++ b/cxx/pclean/pclean_lib.cc @@ -34,6 +34,9 @@ T_observations translate_observations( // Give every row it's own universe of unique id's. // TODO(thomaswc): Correctly handle the case when a row makes // references to two or more different entities of the same type. + // TODO(thomaswc): Discuss other options for handling this, such + // as sampling the non-index domains from a CRP prior or specifying + // additional CSV columns to use as foreign keys. entities.push_back(std::to_string(i)); } obs[col_name].push_back(std::make_tuple(entities, val));