probcomp · ThomasColthurst · Aug 8, 2024 · Jul 31, 2024 · Jul 31, 2024 · Aug 1, 2024
diff --git a/cxx/pclean/BUILD b/cxx/pclean/BUILD
@@ -59,12 +59,14 @@ cc_binary(
     name = "pclean",
     srcs = ["pclean.cc"],
     deps = [
+        ":csv",
         ":io",
         ":schema",
         ":schema_helper",
         "//:cxxopts",
         "//:hirm_lib",
         "//:inference",
+        "//:util_io",
     ],
 )
 

diff --git a/cxx/pclean/pclean.cc b/cxx/pclean/pclean.cc
@@ -8,12 +8,49 @@
 #include <random>
 
 #include "cxxopts.hpp"
+#include "irm.hh"
 #include "hirm.hh"
 #include "inference.hh"
+#include "util_io.hh"
+#include "pclean/csv.hh"
 #include "pclean/io.hh"
 #include "pclean/schema.hh"
 #include "pclean/schema_helper.hh"
 
+// Turns a DataFrame into HIRM observations.  Each non-empty cell yields one
+// observation for the relation stored in `schema` under the cell's column
+// name; its entity list is the row index (as a decimal string) repeated once
+// per domain of that relation.  Relations are fetched with schema.at(), so
+// callers should make sure every column name of `df` is a key of `schema`
+// (main() performs that check before calling).
+T_observations translate_observations(
+    const DataFrame& df, const T_schema &schema) {
+  T_observations translated;
+
+  for (const auto& [column, cells] : df.data) {
+    // Number of domains of this column's relation, whichever variant
+    // alternative it holds.
+    const size_t arity = std::visit(
+        [](const auto &rel) -> size_t { return rel.domains.size(); },
+        schema.at(column));
+
+    for (size_t row = 0; row < cells.size(); ++row) {
+      const std::string& cell = cells[row];
+      // An empty cell is a missing value and is not incorporated.
+      // TODO(thomaswc): Allow the user to specify other values that mean
+      // missing data.  ("missing", "NA", "nan", etc.).
+      if (!cell.empty()) {
+        // Assume that each row of the dataframe is its own entity, *and*
+        // that all of its ancestor entities are distinct from those of any
+        // other entity.
+        const std::vector<std::string> entities(arity, std::to_string(row));
+        translated[column].push_back(std::make_tuple(entities, cell));
+      }
+    }
+  }
+  return translated;
+}
+
 int main(int argc, char** argv) {
   cxxopts::Options options(
       "pclean", "Run HIRM from a PClean schema");
@@ -48,25 +85,41 @@ int main(int argc, char** argv) {
   // Read schema
   PCleanSchema pclean_schema;
   std::string schema_fn = result["schema"].as<std::string>();
+  std::cout << "Reading schema file from " << schema_fn << "\n";
   if (!read_schema_file(schema_fn, &pclean_schema)) {
     std::cout << "Error reading schema file" << schema_fn << "\n";
   }
 
   // Translate schema
+  std::cout << "Translating schema ...\n";
   PCleanSchemaHelper schema_helper(pclean_schema);
   T_schema hirm_schema = schema_helper.make_hirm_schema();
 
   // Read observations
   std::string obs_fn = result["obs"].as<std::string>();
-  // TODO(thomaswc): This
+  std::cout << "Reading observations file from " << obs_fn << "\n";
+  DataFrame df = DataFrame::from_csv(obs_fn);
+
+  // Validate that we have a relation for each observation column.
+  for (const auto &col : df.data) {
+    if (!hirm_schema.contains(col.first)) {
+      printf("Error, could not find HIRM relation for column %s\n",
+             col.first.c_str());
+      assert(false);
+    }
+  }
 
   // Create model
   HIRM hirm(hirm_schema, &prng);
 
   // Incorporate observations.
-  // TODO(thomaswc): This
+  std::cout << "Incorporating observations ...\n";
+  T_observations observations = translate_observations(df, hirm_schema);
+  T_encoding encoding = encode_observations(hirm_schema, observations);
+  incorporate_observations(&prng, &hirm, encoding, observations);
 
   // Run inference
+  std::cout << "Running inference ...\n";
   inference_hirm(&prng, &hirm,
                  result["iters"].as<int>(),
                  result["timeout"].as<int>(),