probcomp · ThomasColthurst · Aug 8, 2024 · Jul 31, 2024 · Jul 31, 2024 · Aug 1, 2024
diff --git a/cxx/pclean/BUILD b/cxx/pclean/BUILD
@@ -59,12 +59,35 @@ cc_binary(
     name = "pclean",
     srcs = ["pclean.cc"],
     deps = [
+        ":csv",
         ":io",
+        ":pclean_lib",
         ":schema",
         ":schema_helper",
         "//:cxxopts",
         "//:hirm_lib",
         "//:inference",
+        "//:util_io",
+    ],
+)
+
+cc_library(
+    name = "pclean_lib",
+    srcs = ["pclean_lib.cc"],
+    hdrs = ["pclean_lib.hh"],
+    deps = [
+        ":csv",
+        "//:hirm_lib",
+        "//:util_io",
+    ],
+)
+
+cc_test(
+    name = "pclean_lib_test",
+    srcs = ["pclean_lib_test.cc"],
+    deps = [
+        ":pclean_lib",
+        "@boost//:test",
     ],
 )
 

diff --git a/cxx/pclean/csv.cc b/cxx/pclean/csv.cc
@@ -53,7 +53,16 @@ DataFrame DataFrame::from_csv(
       df.data[col_names[i++]].push_back(part);
     }
     if (!first_line) {
-      assert(i == col_names.size());
+      if (i != col_names.size()) {
+        if (line.back() == ',') {
+          // std::getline yields no token for a trailing empty field; add it.
+          df.data[col_names[i++]].push_back("");
+        } else {
+          printf("Only found %ld out of %ld expected columns in line\n%s\n",
+                 i, col_names.size(), line.c_str());
+          assert(false);
+        }
+      }
     }
     first_line = false;
   }

diff --git a/cxx/pclean/pclean.cc b/cxx/pclean/pclean.cc
@@ -8,9 +8,13 @@
 #include <random>
 
 #include "cxxopts.hpp"
+#include "irm.hh"
 #include "hirm.hh"
 #include "inference.hh"
+#include "util_io.hh"
+#include "pclean/csv.hh"
 #include "pclean/io.hh"
+#include "pclean/pclean_lib.hh"
 #include "pclean/schema.hh"
 #include "pclean/schema_helper.hh"
 
@@ -49,6 +53,7 @@ int main(int argc, char** argv) {
   std::cout << "Reading plcean schema ...\n";
   PCleanSchema pclean_schema;
   std::string schema_fn = result["schema"].as<std::string>();
+  std::cout << "Reading schema file from " << schema_fn << "\n";
   if (!read_schema_file(schema_fn, &pclean_schema)) {
     std::cout << "Error reading schema file" << schema_fn << "\n";
   }
@@ -62,16 +67,30 @@ int main(int argc, char** argv) {
   // Read observations
   std::cout << "Reading observations ...\n";
   std::string obs_fn = result["obs"].as<std::string>();
-  // TODO(thomaswc): This
+  std::cout << "Reading observations file from " << obs_fn << "\n";
+  DataFrame df = DataFrame::from_csv(obs_fn);
+
+  // Validate that we have a relation for each observation column.
+  for (const auto &col : df.data) {
+    if (!hirm_schema.contains(col.first)) {
+      printf("Error, could not find HIRM relation for column %s\n",
+             col.first.c_str());
+      assert(false);
+    }
+  }
 
   // Create model
   std::cout << "Creating hirm ...\n";
   HIRM hirm(hirm_schema, &prng);
 
   // Incorporate observations.
-  // TODO(thomaswc): This
+  std::cout << "Incorporating observations ...\n";
+  T_observations observations = translate_observations(df, hirm_schema);
+  T_encoding encoding = encode_observations(hirm_schema, observations);
+  incorporate_observations(&prng, &hirm, encoding, observations);
 
   // Run inference
+  std::cout << "Running inference ...\n";
   inference_hirm(&prng, &hirm,
                  result["iters"].as<int>(),
                  result["timeout"].as<int>(),

diff --git a/cxx/pclean/pclean_lib.cc b/cxx/pclean/pclean_lib.cc
@@ -0,0 +1,43 @@
+// Copyright 2024
+// Apache License, Version 2.0, refer to LICENSE.txt
+
+#include "irm.hh"
+#include "pclean/csv.hh"
+#include "pclean/pclean_lib.hh"
+
+// Builds one observation per non-empty cell of df whose column names a
+// relation in schema; entities are derived from the cell's row index.
+T_observations translate_observations(
+    const DataFrame& df, const T_schema &schema) {
+  T_observations translated;
+  for (const auto& [relation_name, cells] : df.data) {
+    // Columns with no matching relation are reported and skipped.
+    const auto found = schema.find(relation_name);
+    if (found == schema.end()) {
+      printf("Schema does not contain %s, skipping ...\n",
+             relation_name.c_str());
+      continue;
+    }
+
+    // An observation needs one entity per domain of the relation.
+    const size_t arity = std::visit(
+        [](const auto& rel) { return rel.domains.size(); }, found->second);
+
+    for (size_t row = 0; row < cells.size(); ++row) {
+      const std::string& cell = cells[row];
+      // Don't incorporate missing values.
+      // TODO(thomaswc): Allow the user to specify other values that mean
+      // missing data.  ("missing", "NA", "nan", etc.).
+      if (cell.empty()) {
+        continue;
+      }
+
+      // Each row gets its own ids: the row index names every entity.
+      // TODO(thomaswc): Correctly handle the case when a row makes
+      // references to two or more different entities of the same type.
+      const std::vector<std::string> entities(arity, std::to_string(row));
+      translated[relation_name].push_back(std::make_tuple(entities, cell));
+    }
+  }
+  return translated;
+}
diff --git a/cxx/pclean/pclean_lib.hh b/cxx/pclean/pclean_lib.hh
@@ -0,0 +1,16 @@
+// Copyright 2024
+// Apache License, Version 2.0, refer to LICENSE.txt
+
+#pragma once
+
+#include "irm.hh"
+#include "util_io.hh"
+#include "pclean/csv.hh"
+
+// For each non-missing (non-empty) value in the DataFrame df, create an
+// observation in the returned T_observations.  The column name of the value
+// is used as the relation name; columns with no relation in schema are
+// skipped.  Every domain of the relation is given the value's row index as
+// its entity, so each row gets its own set of entities.
+T_observations translate_observations(
+    const DataFrame& df, const T_schema &schema);
diff --git a/cxx/pclean/pclean_lib_test.cc b/cxx/pclean/pclean_lib_test.cc
@@ -0,0 +1,57 @@
+#define BOOST_TEST_MODULE test pclean_csv
+
+#include "pclean/pclean_lib.hh"
+#include <sstream>
+#include <boost/test/included/unit_test.hpp>
+namespace tt = boost::test_tools;
+
+BOOST_AUTO_TEST_CASE(test_translate_observations) {
+  std::stringstream ss(R"""(Column1,Room Type,Monthly Rent,County,State
+0,studio,,Mahoning County,OH
+1,4br,2152.0,,NV
+2,1br,1267.0,Gwinnett County,
+)""");
+  // Exercise translate_observations on rows that each have one empty cell.
+  DataFrame df = DataFrame::from_csv(ss);
+
+  std::map<std::string, std::string> state_params = {{"strings", "AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY"}};
+  std::map<std::string, std::string> br_params = {{"strings", "1br 2br 3br 4br studio"}};
+  // Note that "Column1" has no corresponding relation in the schema.
+  T_schema schema = {
+    {"County:name",
+      T_clean_relation{{"County"}, false, DistributionSpec("bigram")}},
+    {"County:state",
+      T_clean_relation{{"County"}, false, DistributionSpec("stringcat", state_params)}},
+    {"Room Type",
+      T_clean_relation{{"Obs"}, false, DistributionSpec("stringcat", br_params)}},
+    {"Monthly Rent",
+      T_clean_relation{{"Obs"}, false, DistributionSpec("normal")}},
+    {"County",
+      T_noisy_relation{{"County", "Obs"}, false, EmissionSpec("bigram"), "County:name"}},
+    {"State",
+      T_noisy_relation{{"County", "Obs"}, false, EmissionSpec("bigram"), "County:state"}}};
+
+  T_observations obs = translate_observations(df, schema);
+
+  // Relations not corresponding to columns should be un-observed.
+  BOOST_TEST(!obs.contains("County:name"));
+  BOOST_TEST(!obs.contains("County:state"));
+  // Each column yields one observation per non-empty cell.
+  BOOST_TEST(obs["Room Type"].size() == 3);
+  BOOST_TEST(obs["Monthly Rent"].size() == 2);
+  BOOST_TEST(obs["County"].size() == 2);
+  BOOST_TEST(obs["State"].size() == 2);
+  // Index 0 is the first non-missing row; entities has one entry per domain.
+  BOOST_TEST(std::get<0>(obs["Room Type"][0]).size() == 1);
+  BOOST_TEST(std::get<1>(obs["Room Type"][0]) == "studio");
+
+  BOOST_TEST(std::get<0>(obs["Monthly Rent"][0]).size() == 1);
+  BOOST_TEST(std::get<1>(obs["Monthly Rent"][0]) == "2152.0");
+
+  BOOST_TEST(std::get<0>(obs["County"][0]).size() == 2);
+  BOOST_TEST(std::get<1>(obs["County"][0]) == "Mahoning County");
+
+  BOOST_TEST(std::get<0>(obs["State"][0]).size() == 2);
+  BOOST_TEST(std::get<1>(obs["State"][0]) == "OH");
+}
+