diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml new file mode 100644 index 0000000..c57bffe --- /dev/null +++ b/.github/workflows/continuous-integration.yml @@ -0,0 +1,24 @@ +# This workflow will run tests. + +name: Tests +on: + push: + branches: [ main ] + pull_request: +jobs: + run_tests: + name: Run tests + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Checking out repository + uses: actions/checkout@v4 + - name: Install dependencies + run : | + sudo apt-get update && sudo apt-get install -yq clang clang-format + - name: Build everything + run: | + cd cxx && bazel build ... + - name: Test C++ + run: | + cd cxx && bazel test --test_output=errors ... diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml deleted file mode 100644 index 3c16fc2..0000000 --- a/.github/workflows/python-package.yml +++ /dev/null @@ -1,30 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Python package - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.8] - - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Test Python and C++ - run: | - python -m pip install --upgrade pip - python -m pip install . 
- ./check.sh diff --git a/check.sh b/check.sh index c00d795..b32ae87 100755 --- a/check.sh +++ b/check.sh @@ -17,7 +17,7 @@ root=$(cd -- "$(dirname -- "$0")" && pwd) if [ $# -eq 0 ]; then # (Default) Run tests/ ./pythenv.sh "$PYTHON" -m pytest --pyargs hirm - cd cxx && make tests + cd cxx && bazel test :all elif [ ${1} = 'coverage' ]; then # Generate coverage report. ./pythenv.sh coverage run --source=build/ -m pytest --pyargs hirm diff --git a/cxx/.clang-format b/cxx/.clang-format new file mode 100644 index 0000000..cdf4a3b --- /dev/null +++ b/cxx/.clang-format @@ -0,0 +1,8 @@ +Language: Cpp +BasedOnStyle: Google +DerivePointerAlignment: false +PointerAlignment: Left +BreakConstructorInitializers: BeforeColon +ConstructorInitializerIndentWidth: 4 +PackConstructorInitializers: NextLine +SpaceBeforeCtorInitializerColon: true diff --git a/cxx/distributions/adapter.hh b/cxx/distributions/adapter.hh index df6f591..db385ee 100644 --- a/cxx/distributions/adapter.hh +++ b/cxx/distributions/adapter.hh @@ -49,14 +49,12 @@ class DistributionAdapter : Distribution { double logp_score() const { return d->logp_score(); } - std::string sample() { - SampleType s = d->sample(); - return to_string(s); - } - - void transition_hyperparameters() { - d->transition_hyperparameters(); - } + std::string sample() { + SampleType s = d->sample(); + return to_string(s); + } + + void transition_hyperparameters() { d->transition_hyperparameters(); } ~DistributionAdapter() { delete d; } }; diff --git a/cxx/distributions/beta_bernoulli.hh b/cxx/distributions/beta_bernoulli.hh index b9ea1ae..30c5135 100644 --- a/cxx/distributions/beta_bernoulli.hh +++ b/cxx/distributions/beta_bernoulli.hh @@ -8,68 +8,68 @@ #include "util_math.hh" class BetaBernoulli : public Distribution { - public: - double alpha = 1; // hyperparameter - double beta = 1; // hyperparameter - int s = 0; // sum of observed values - std::mt19937* prng; + public: + double alpha = 1; // hyperparameter + double beta = 1; // 
hyperparameter + int s = 0; // sum of observed values + std::mt19937* prng; - std::vector alpha_grid; - std::vector beta_grid; + std::vector alpha_grid; + std::vector beta_grid; - // BetaBernoulli does not take ownership of prng. - BetaBernoulli(std::mt19937 *prng) { - this->prng = prng; - alpha_grid = log_linspace(1e-4, 1e4, 10, true); - beta_grid = log_linspace(1e-4, 1e4, 10, true); - } - void incorporate(const double& x){ - assert(x == 0 || x == 1); - ++N; - s += x; - } - void unincorporate(const double& x) { - assert(x == 0 || x ==1); - --N; - s -= x; - assert(0 <= s); - assert(0 <= N); - } - double logp(const double& x) const { - assert(x == 0 || x == 1); - double log_denom = log(N + alpha + beta); - double log_numer = x ? log(s + alpha) : log(N - s + beta); - return log_numer - log_denom; - } - double logp_score() const { - double v1 = lbeta(s + alpha, N - s + beta); - double v2 = lbeta(alpha, beta); - return v1 - v2; - } - double sample() { - double p = exp(logp(1)); - std::vector items {0, 1}; - std::vector weights {1-p, p}; - int idx = choice(weights, prng); - return items[idx]; - } - void transition_hyperparameters() { - std::vector logps; - std::vector> hypers; - // C++ doesn't yet allow range for-loops over existing variables. Sigh. - for (double alphat : alpha_grid) { - for (double betat : beta_grid) { - alpha = alphat; - beta = betat; - double lp = logp_score(); - if (!std::isnan(lp)) { - logps.push_back(logp_score()); - hypers.push_back(std::make_pair(alpha, beta)); - } + // BetaBernoulli does not take ownership of prng. 
+ BetaBernoulli(std::mt19937* prng) { + this->prng = prng; + alpha_grid = log_linspace(1e-4, 1e4, 10, true); + beta_grid = log_linspace(1e-4, 1e4, 10, true); + } + void incorporate(const double& x) { + assert(x == 0 || x == 1); + ++N; + s += x; + } + void unincorporate(const double& x) { + assert(x == 0 || x == 1); + --N; + s -= x; + assert(0 <= s); + assert(0 <= N); + } + double logp(const double& x) const { + assert(x == 0 || x == 1); + double log_denom = log(N + alpha + beta); + double log_numer = x ? log(s + alpha) : log(N - s + beta); + return log_numer - log_denom; + } + double logp_score() const { + double v1 = lbeta(s + alpha, N - s + beta); + double v2 = lbeta(alpha, beta); + return v1 - v2; + } + double sample() { + double p = exp(logp(1)); + std::vector items{0, 1}; + std::vector weights{1 - p, p}; + int idx = choice(weights, prng); + return items[idx]; + } + void transition_hyperparameters() { + std::vector logps; + std::vector> hypers; + // C++ doesn't yet allow range for-loops over existing variables. Sigh. + for (double alphat : alpha_grid) { + for (double betat : beta_grid) { + alpha = alphat; + beta = betat; + double lp = logp_score(); + if (!std::isnan(lp)) { + logps.push_back(lp); // reuse the score computed above + hypers.push_back(std::make_pair(alpha, beta)); } } - int i = sample_from_logps(logps, prng); - alpha = hypers[i].first; - beta = hypers[i].second; } + int i = sample_from_logps(logps, prng); + alpha = hypers[i].first; + beta = hypers[i].second; + } }; diff --git a/cxx/distributions/bigram.hh b/cxx/distributions/bigram.hh index 7ccaec1..91f6bc8 100644 --- a/cxx/distributions/bigram.hh +++ b/cxx/distributions/bigram.hh @@ -108,18 +108,18 @@ class Bigram : public Distribution { } void transition_hyperparameters() { - std::vector logps; - std::vector alphas; - // C++ doesn't yet allow range for-loops over existing variables. Sigh. 
- for (double alphat : ALPHA_GRID) { - set_alpha(alphat); - double lp = logp_score(); - if (!std::isnan(lp)) { - logps.push_back(logp_score()); - alphas.push_back(alphat); - } + std::vector logps; + std::vector alphas; + // C++ doesn't yet allow range for-loops over existing variables. Sigh. + for (double alphat : ALPHA_GRID) { + set_alpha(alphat); + double lp = logp_score(); + if (!std::isnan(lp)) { + logps.push_back(lp); // reuse the score computed above + alphas.push_back(alphat); } - int i = sample_from_logps(logps, prng); - set_alpha(alphas[i]); + } + int i = sample_from_logps(logps, prng); + set_alpha(alphas[i]); } }; diff --git a/cxx/distributions/dirichlet_categorical.hh b/cxx/distributions/dirichlet_categorical.hh index f0ba646..1d1c981 100644 --- a/cxx/distributions/dirichlet_categorical.hh +++ b/cxx/distributions/dirichlet_categorical.hh @@ -5,79 +5,74 @@ #include #include #include + #include "base.hh" #include "util_math.hh" -#define ALPHA_GRID {1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0, 1000.0, 10000.0} +#define ALPHA_GRID \ + { 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0, 1000.0, 10000.0 } class DirichletCategorical : public Distribution { - public: - double alpha = 1; // hyperparameter (applies to all categories) - std::vector counts; // counts of observed categories - int n; // Total number of observations. - std::mt19937* prng; + public: + double alpha = 1; // hyperparameter (applies to all categories) + std::vector counts; // counts of observed categories + int n; // Total number of observations. + std::mt19937* prng; - // DirichletCategorical does not take ownership of prng. 
- DirichletCategorical(std::mt19937 *prng, int k) { // k is number of categories - this->prng = prng; - counts = std::vector(k, 0); - n = 0; - } - void incorporate(const double& x) { - assert(x >= 0 && x < counts.size()); - counts[size_t(x)] += 1; - ++n; - } - void unincorporate(const double& x) { - const size_t y = x; - assert(y < counts.size()); - counts[y] -= 1; - --n; - assert(0 <= counts[y]); - assert(0 <= n); - } - double logp(const double& x) const { - assert(x >= 0 && x < counts.size()); - const double numer = log(alpha + counts[size_t(x)]); - const double denom = log(n + alpha * counts.size()); - return numer - denom; - } - double logp_score() const { - const size_t k = counts.size(); - const double a = alpha * k; - const double lg = std::transform_reduce( - counts.cbegin(), - counts.cend(), - 0, - std::plus{}, - [&](size_t y) -> double {return lgamma(y + alpha); } - ); - return lgamma(a) - lgamma(a + n) + lg - k * lgamma(alpha); - } - double sample() { - std::vector weights(counts.size()); - std::transform( - counts.begin(), - counts.end(), - weights.begin(), - [&](size_t y) -> double { return y + alpha; } - ); - int idx = choice(weights, prng); - return double(idx); - } - void transition_hyperparameters() { - std::vector logps; - std::vector alphas; - // C++ doesn't yet allow range for-loops over existing variables. Sigh. - for (double alphat : ALPHA_GRID) { - alpha = alphat; - double lp = logp_score(); - if (!std::isnan(lp)) { - logps.push_back(logp_score()); - alphas.push_back(alpha); - } + // DirichletCategorical does not take ownership of prng. 
+ DirichletCategorical(std::mt19937* prng, + int k) { // k is number of categories + this->prng = prng; + counts = std::vector(k, 0); + n = 0; + } + void incorporate(const double& x) { + assert(x >= 0 && x < counts.size()); + counts[size_t(x)] += 1; + ++n; + } + void unincorporate(const double& x) { + const size_t y = x; + assert(y < counts.size()); + counts[y] -= 1; + --n; + assert(0 <= counts[y]); + assert(0 <= n); + } + double logp(const double& x) const { + assert(x >= 0 && x < counts.size()); + const double numer = log(alpha + counts[size_t(x)]); + const double denom = log(n + alpha * counts.size()); + return numer - denom; + } + double logp_score() const { + const size_t k = counts.size(); + const double a = alpha * k; + const double lg = std::transform_reduce( + counts.cbegin(), counts.cend(), 0.0, std::plus{}, // double accumulator + [&](size_t y) -> double { return lgamma(y + alpha); }); + return lgamma(a) - lgamma(a + n) + lg - k * lgamma(alpha); + } + double sample() { + std::vector weights(counts.size()); + std::transform(counts.begin(), counts.end(), weights.begin(), + [&](size_t y) -> double { return y + alpha; }); + int idx = choice(weights, prng); + return double(idx); + } + void transition_hyperparameters() { + std::vector logps; + std::vector alphas; + // C++ doesn't yet allow range for-loops over existing variables. Sigh. 
+ for (double alphat : ALPHA_GRID) { + alpha = alphat; + double lp = logp_score(); + if (!std::isnan(lp)) { + logps.push_back(lp); // reuse the score computed above + alphas.push_back(alpha); } - int i = sample_from_logps(logps, prng); - alpha = alphas[i]; } + int i = sample_from_logps(logps, prng); + alpha = alphas[i]; + } }; diff --git a/cxx/distributions/normal.hh b/cxx/distributions/normal.hh index 006d1b7..729cb26 100644 --- a/cxx/distributions/normal.hh +++ b/cxx/distributions/normal.hh @@ -3,6 +3,8 @@ #pragma once #include +#include +#include #include "base.hh" #include "util_math.hh" @@ -15,10 +17,14 @@ #define M_2PI 6.28318530717958647692528676655 #endif -#define R_GRID {0.1, 1.0, 10.0} -#define V_GRID {0.5, 1.0, 2.0, 5.0} -#define M_GRID {-1.0, 0.0, 1.0} -#define S_GRID {0.5, 1.0, 2.0} +#define R_GRID \ + { 0.1, 1.0, 10.0 } +#define V_GRID \ + { 0.5, 1.0, 2.0, 5.0 } +#define M_GRID \ + { -1.0, 0.0, 1.0 } +#define S_GRID \ + { 0.5, 1.0, 2.0 } double logZ(double r, double v, double s) { return (v + 1.0) / 2.0 * log(2.0) + 0.5 * log(M_PI) - 0.5 * log(r) - @@ -42,19 +48,19 @@ class Normal : public Distribution { double mean = 0.0; // Mean of observed values double var = 0.0; // Variance of observed values - std::mt19937 *prng; + std::mt19937* prng; // Normal does not take ownership of prng. 
- Normal(std::mt19937 *prng) { this->prng = prng; } + Normal(std::mt19937* prng) { this->prng = prng; } - void incorporate(const double &x) { + void incorporate(const double& x) { ++N; double old_mean = mean; mean += (x - mean) / N; var += (x - mean) * (x - old_mean); } - void unincorporate(const double &x) { + void unincorporate(const double& x) { int old_N = N; --N; if (N == 0) { @@ -67,7 +73,7 @@ class Normal : public Distribution { var -= (x - mean) * (x - old_mean); } - void posterior_hypers(double *mprime, double *sprime) const { + void posterior_hypers(double* mprime, double* sprime) const { // r' = r + N // m' = (r m + N mean) / (r + N) // C = N (var + mean^2) @@ -78,12 +84,12 @@ class Normal : public Distribution { N * (var - 2 * mean * mdelta - mdelta * mdelta); } - double logp(const double &x) const { + double logp(const double& x) const { // Based on equation (13) of GaussianInverseGamma.pdf double unused_mprime, sprime; - const_cast(this)->incorporate(x); + const_cast(this)->incorporate(x); posterior_hypers(&unused_mprime, &sprime); - const_cast(this)->unincorporate(x); + const_cast(this)->unincorporate(x); double sprime2; posterior_hypers(&unused_mprime, &sprime2); return -0.5 * log(M_2PI) + logZ(r + N + 1, v + N + 1, sprime) - @@ -141,6 +147,6 @@ class Normal : public Distribution { } // Disable copying. - Normal &operator=(const Normal &) = delete; - Normal(const Normal &) = delete; + Normal& operator=(const Normal&) = delete; + Normal(const Normal&) = delete; }; diff --git a/cxx/hirm.cc b/cxx/hirm.cc index 03fa7dc..fab7576 100644 --- a/cxx/hirm.cc +++ b/cxx/hirm.cc @@ -32,32 +32,32 @@ fflush(stdout); \ } -void single_step_irm_inference(IRM *irm, double &t_total, bool verbose) { - // TRANSITION ASSIGNMENTS. 
- for (const auto &[d, domain] : irm->domains) { - for (const auto item : domain->items) { - clock_t t = clock(); - irm->transition_cluster_assignment_item(d, item); - REPORT_SCORE(verbose, t, t_total, irm); - } - } - // TRANSITION DISTRIBUTION HYPERPARAMETERS. - for (const auto &[r, relation]: irm->relations) { - for (const auto &[c, distribution]: relation->clusters) { - clock_t t = clock(); - distribution->transition_hyperparameters(); - REPORT_SCORE(verbose, t, t_total, irm); - } +void single_step_irm_inference(IRM* irm, double& t_total, bool verbose) { + // TRANSITION ASSIGNMENTS. + for (const auto& [d, domain] : irm->domains) { + for (const auto item : domain->items) { + clock_t t = clock(); + irm->transition_cluster_assignment_item(d, item); + REPORT_SCORE(verbose, t, t_total, irm); } - // TRANSITION ALPHA. - for (const auto &[d, domain] : irm->domains) { + } + // TRANSITION DISTRIBUTION HYPERPARAMETERS. + for (const auto& [r, relation] : irm->relations) { + for (const auto& [c, distribution] : relation->clusters) { clock_t t = clock(); - domain->crp.transition_alpha(); + distribution->transition_hyperparameters(); REPORT_SCORE(verbose, t, t_total, irm); } + } + // TRANSITION ALPHA. + for (const auto& [d, domain] : irm->domains) { + clock_t t = clock(); + domain->crp.transition_alpha(); + REPORT_SCORE(verbose, t, t_total, irm); + } } -void inference_irm(IRM *irm, int iters, int timeout, bool verbose) { +void inference_irm(IRM* irm, int iters, int timeout, bool verbose) { clock_t t_begin = clock(); double t_total = 0; for (int i = 0; i < iters; ++i) { @@ -66,25 +66,25 @@ void inference_irm(IRM *irm, int iters, int timeout, bool verbose) { } } -void inference_hirm(HIRM *hirm, int iters, int timeout, bool verbose) { +void inference_hirm(HIRM* hirm, int iters, int timeout, bool verbose) { clock_t t_begin = clock(); double t_total = 0; for (int i = 0; i < iters; ++i) { CHECK_TIMEOUT(timeout, t_begin); // TRANSITION RELATIONS. 
- for (const auto &[r, rc] : hirm->relation_to_code) { + for (const auto& [r, rc] : hirm->relation_to_code) { clock_t t = clock(); hirm->transition_cluster_assignment_relation(r); REPORT_SCORE(verbose, t, t_total, hirm); } // TRANSITION IRMs. - for (const auto &[t, irm] : hirm->irms) { + for (const auto& [t, irm] : hirm->irms) { single_step_irm_inference(irm, t_total, verbose); } } } -int main(int argc, char **argv) { +int main(int argc, char** argv) { cxxopts::Options options("hirm", "Run a hierarchical infinite relational model."); options.add_options()("help", "show help message")( @@ -145,7 +145,7 @@ int main(int argc, char **argv) { if (mode == "irm") { std::cout << "selected model is IRM" << std::endl; - IRM *irm; + IRM* irm; // Load if (path_clusters.empty()) { irm = new IRM(schema, &prng); @@ -171,7 +171,7 @@ int main(int argc, char **argv) { if (mode == "hirm") { std::cout << "selected model is HIRM" << std::endl; - HIRM *hirm; + HIRM* hirm; // Load if (path_clusters.empty()) { hirm = new HIRM(schema, &prng); diff --git a/cxx/hirm.hh b/cxx/hirm.hh index 21f9918..b5d1fef 100644 --- a/cxx/hirm.hh +++ b/cxx/hirm.hh @@ -40,11 +40,11 @@ class CRP { std::unordered_map> tables; // map from table id to set of customers std::unordered_map assignments; // map from customer to table id - std::mt19937 *prng; + std::mt19937* prng; - CRP(std::mt19937 *prng) { this->prng = prng; } + CRP(std::mt19937* prng) { this->prng = prng; } - void incorporate(const T_item &item, int table) { + void incorporate(const T_item& item, int table) { assert(!assignments.contains(item)); if (!tables.contains(table)) { tables[table] = std::unordered_set(); @@ -53,7 +53,7 @@ class CRP { assignments[item] = table; ++N; } - void unincorporate(const T_item &item) { + void unincorporate(const T_item& item) { assert(assignments.contains(item)); int table = assignments.at(item); tables.at(table).erase(item); @@ -68,7 +68,7 @@ class CRP { std::vector items(crp_dist.size()); std::vector 
weights(crp_dist.size()); int i = 0; - for (const auto &[table, weight] : crp_dist) { + for (const auto& [table, weight] : crp_dist) { items[i] = table; weights[i] = weight; ++i; @@ -88,7 +88,7 @@ class CRP { double logp_score() const { double term1 = tables.size() * log(alpha); double term2 = 0; - for (const auto &[table, customers] : tables) { + for (const auto& [table, customers] : tables) { term2 += lgamma(customers.size()); } double term3 = lgamma(alpha); @@ -102,7 +102,7 @@ class CRP { return dist; } int t_max = 0; - for (const auto &[table, customers] : tables) { + for (const auto& [table, customers] : tables) { dist[table] = customers.size(); t_max = std::max(table, t_max); } @@ -117,7 +117,7 @@ class CRP { if (dist.at(table) == 0) { dist.at(table) = alpha; int t_max = 0; - for (const auto &[table, weight] : dist) { + for (const auto& [table, weight] : dist) { t_max = std::max(table, t_max); } dist.erase(t_max); @@ -130,7 +130,7 @@ class CRP { } std::vector grid = log_linspace(1. 
/ N, N + 1, 20, true); std::vector logps; - for (const double &g : grid) { + for (const double& g : grid) { this->alpha = g; double logp_g = logp_score(); logps.push_back(logp_g); @@ -145,13 +145,13 @@ class Domain { const std::string name; // human-readable name std::unordered_set items; // set of items CRP crp; // clustering model for items - std::mt19937 *prng; + std::mt19937* prng; - Domain(const std::string &name, std::mt19937 *prng) : name(name), crp(prng) { + Domain(const std::string& name, std::mt19937* prng) : name(name), crp(prng) { assert(!name.empty()); this->prng = prng; } - void incorporate(const T_item &item, int table = -1) { + void incorporate(const T_item& item, int table = -1) { if (items.contains(item)) { assert(table == -1); } else { @@ -160,7 +160,7 @@ class Domain { crp.incorporate(item, t); } } - void unincorporate(const T_item &item) { + void unincorporate(const T_item& item) { printf("Not implemented\n"); exit(EXIT_FAILURE); // assert(items.count(item) == 1); @@ -171,11 +171,11 @@ class Domain { // items.erase(item); // } } - int get_cluster_assignment(const T_item &item) const { + int get_cluster_assignment(const T_item& item) const { assert(items.contains(item)); return crp.assignments.at(item); } - void set_cluster_assignment_gibbs(const T_item &item, int table) { + void set_cluster_assignment_gibbs(const T_item& item, int table) { assert(items.contains(item)); assert(crp.assignments.at(item) != table); crp.unincorporate(item); @@ -185,7 +185,7 @@ class Domain { return crp.tables_weights(); } std::unordered_map tables_weights_gibbs( - const T_item &item) const { + const T_item& item) const { int table = get_cluster_assignment(item); return crp.tables_weights_gibbs(table); } @@ -198,9 +198,9 @@ class Relation { // Distribution over the relation's codomain. 
const std::string distribution; // list of domain pointers - const std::vector domains; + const std::vector domains; // map from cluster multi-index to Distribution pointer - std::unordered_map, Distribution *, + std::unordered_map, Distribution*, VectorIntHash> clusters; // map from item to observed data @@ -211,15 +211,15 @@ class Relation { std::string, std::unordered_map>> data_r; - std::mt19937 *prng; + std::mt19937* prng; - Relation(const std::string &name, const std::string &distribution, - const std::vector &domains, std::mt19937 *prng) + Relation(const std::string& name, const std::string& distribution, + const std::vector& domains, std::mt19937* prng) : name(name), distribution(distribution), domains(domains) { assert(!domains.empty()); assert(!name.empty()); this->prng = prng; - for (const Domain *const d : domains) { + for (const Domain* const d : domains) { this->data_r[d->name] = std::unordered_map>(); } @@ -234,13 +234,13 @@ class Relation { T_relation get_T_relation() { T_relation trel; trel.distribution = distribution; - for (const auto &d : domains) { + for (const auto& d : domains) { trel.domains.push_back(d->name); } return trel; } - void incorporate(const T_items &items, double value) { + void incorporate(const T_items& items, double value) { assert(!data.contains(items)); data[items] = value; for (int i = 0; i < std::ssize(domains); ++i) { @@ -262,7 +262,7 @@ class Relation { clusters.at(z)->incorporate(value); } - void unincorporate(const T_items &items) { + void unincorporate(const T_items& items) { printf("Not implemented\n"); exit(EXIT_FAILURE); // auto x = data.at(items); @@ -285,7 +285,7 @@ class Relation { // data.erase(items); } - std::vector get_cluster_assignment(const T_items &items) const { + std::vector get_cluster_assignment(const T_items& items) const { assert(items.size() == domains.size()); std::vector z(domains.size()); for (int i = 0; i < std::ssize(domains); ++i) { @@ -294,9 +294,9 @@ class Relation { return z; } - 
std::vector get_cluster_assignment_gibbs(const T_items &items, - const Domain &domain, - const T_item &item, + std::vector get_cluster_assignment_gibbs(const T_items& items, + const Domain& domain, + const T_item& item, int table) const { assert(items.size() == domains.size()); std::vector z(domains.size()); @@ -315,9 +315,9 @@ class Relation { // Implementation of approximate Gibbs data probabilities (faster). - double logp_gibbs_approx_current(const Domain &domain, const T_item &item) { + double logp_gibbs_approx_current(const Domain& domain, const T_item& item) { double logp = 0.; - for (const T_items &items : data_r.at(domain.name).at(item)) { + for (const T_items& items : data_r.at(domain.name).at(item)) { double x = data.at(items); T_items z = get_cluster_assignment(items); auto cluster = clusters.at(z); @@ -329,10 +329,10 @@ class Relation { return logp; } - double logp_gibbs_approx_variant(const Domain &domain, const T_item &item, + double logp_gibbs_approx_variant(const Domain& domain, const T_item& item, int table) { double logp = 0.; - for (const T_items &items : data_r.at(domain.name).at(item)) { + for (const T_items& items : data_r.at(domain.name).at(item)) { double x = data.at(items); T_items z = get_cluster_assignment_gibbs(items, domain, item, table); double lp; @@ -347,7 +347,7 @@ class Relation { return logp; } - double logp_gibbs_approx(const Domain &domain, const T_item &item, + double logp_gibbs_approx(const Domain& domain, const T_item& item, int table) { int table_current = domain.get_cluster_assignment(item); return table_current == table @@ -359,29 +359,29 @@ class Relation { std::unordered_map const, std::vector, VectorIntHash> - get_cluster_to_items_list(Domain const &domain, const T_item &item) { + get_cluster_to_items_list(Domain const& domain, const T_item& item) { std::unordered_map, std::vector, VectorIntHash> m; - for (const T_items &items : data_r.at(domain.name).at(item)) { + for (const T_items& items : 
data_r.at(domain.name).at(item)) { T_items z = get_cluster_assignment(items); m[z].push_back(items); } return m; } - double logp_gibbs_exact_current(const std::vector &items_list) { + double logp_gibbs_exact_current(const std::vector& items_list) { assert(!items_list.empty()); T_items z = get_cluster_assignment(items_list[0]); auto cluster = clusters.at(z); double logp0 = cluster->logp_score(); - for (const T_items &items : items_list) { + for (const T_items& items : items_list) { double x = data.at(items); // assert(z == get_cluster_assignment(items)); cluster->unincorporate(x); } double logp1 = cluster->logp_score(); - for (const T_items &items : items_list) { + for (const T_items& items : items_list) { double x = data.at(items); cluster->incorporate(x); } @@ -389,25 +389,25 @@ class Relation { return logp0 - logp1; } - double logp_gibbs_exact_variant(const Domain &domain, const T_item &item, + double logp_gibbs_exact_variant(const Domain& domain, const T_item& item, int table, - const std::vector &items_list) { + const std::vector& items_list) { assert(!items_list.empty()); T_items z = get_cluster_assignment_gibbs(items_list[0], domain, item, table); BetaBernoulli aux(prng); - Distribution *cluster = + Distribution* cluster = clusters.contains(z) ? 
clusters.at(z) : &aux; // auto cluster = self.clusters.get(z, self.aux()) double logp0 = cluster->logp_score(); - for (const T_items &items : items_list) { + for (const T_items& items : items_list) { // assert(z == get_cluster_assignment_gibbs(items, domain, item, table)); double x = data.at(items); cluster->incorporate(x); } const double logp1 = cluster->logp_score(); - for (const T_items &items : items_list) { + for (const T_items& items : items_list) { double x = data.at(items); cluster->unincorporate(x); } @@ -415,16 +415,16 @@ class Relation { return logp1 - logp0; } - std::vector logp_gibbs_exact(const Domain &domain, const T_item &item, + std::vector logp_gibbs_exact(const Domain& domain, const T_item& item, std::vector tables) { auto cluster_to_items_list = get_cluster_to_items_list(domain, item); int table_current = domain.get_cluster_assignment(item); std::vector logps; // size this? logps.reserve(tables.size()); double lp_cluster; - for (const int &table : tables) { + for (const int& table : tables) { double lp_table = 0; - for (const auto &[z, items_list] : cluster_to_items_list) { + for (const auto& [z, items_list] : cluster_to_items_list) { lp_cluster = (table == table_current) ? 
logp_gibbs_exact_current(items_list) @@ -436,7 +436,7 @@ class Relation { return logps; } - double logp(const T_items &items, double value) { + double logp(const T_items& items, double value) { // TODO: Falsely assumes cluster assignments of items // from same domain are identical, see note in hirm.py assert(items.size() == domains.size()); @@ -444,7 +444,7 @@ class Relation { std::vector> wght_list; std::vector> indx_list; for (int i = 0; i < std::ssize(domains); ++i) { - Domain *domain = domains.at(i); + Domain* domain = domains.at(i); T_item item = items.at(i); std::vector t_list; std::vector w_list; @@ -458,7 +458,7 @@ class Relation { auto tables_weights = domain->tables_weights(); double Z = log(domain->crp.alpha + domain->crp.N); int idx = 0; - for (const auto &[t, w] : tables_weights) { + for (const auto& [t, w] : tables_weights) { t_list.push_back(t); w_list.push_back(log(w) - Z); i_list.push_back(idx++); @@ -470,7 +470,7 @@ class Relation { indx_list.push_back(i_list); } std::vector logps; - for (const auto &indexes : product(indx_list)) { + for (const auto& indexes : product(indx_list)) { assert(indexes.size() == domains.size()); std::vector z; z.reserve(domains.size()); @@ -482,7 +482,7 @@ class Relation { logp_w += wi; } BetaBernoulli aux(prng); - Distribution *cluster = + Distribution* cluster = clusters.contains(z) ? 
clusters.at(z) : &aux; double logp_z = cluster->logp(value); double logp_zw = logp_z + logp_w; @@ -493,17 +493,17 @@ class Relation { double logp_score() const { double logp = 0.0; - for (const auto &[_, cluster] : clusters) { + for (const auto& [_, cluster] : clusters) { logp += cluster->logp_score(); } return logp; } - void set_cluster_assignment_gibbs(const Domain &domain, const T_item &item, + void set_cluster_assignment_gibbs(const Domain& domain, const T_item& item, int table) { int table_current = domain.get_cluster_assignment(item); assert(table != table_current); - for (const T_items &items : data_r.at(domain.name).at(item)) { + for (const T_items& items : data_r.at(domain.name).at(item)) { double x = data.at(items); // Remove from current cluster. T_items z_prev = get_cluster_assignment(items); @@ -528,28 +528,28 @@ class Relation { // Caller should invoke domain.set_cluster_gibbs } - bool has_observation(const Domain &domain, const T_item &item) { + bool has_observation(const Domain& domain, const T_item& item) { return data_r.at(domain.name).contains(item); } // Disable copying. 
- Relation &operator=(const Relation &) = delete; - Relation(const Relation &) = delete; + Relation& operator=(const Relation&) = delete; + Relation(const Relation&) = delete; }; class IRM { public: - T_schema schema; // schema of relations - std::unordered_map domains; // map from name to Domain - std::unordered_map + T_schema schema; // schema of relations + std::unordered_map domains; // map from name to Domain + std::unordered_map relations; // map from name to Relation std::unordered_map> domain_to_relations; // reverse map - std::mt19937 *prng; + std::mt19937* prng; - IRM(const T_schema &schema, std::mt19937 *prng) { + IRM(const T_schema& schema, std::mt19937* prng) { this->prng = prng; - for (const auto &[name, relation] : schema) { + for (const auto& [name, relation] : schema) { this->add_relation(name, relation); } } @@ -563,45 +563,45 @@ class IRM { } } - void incorporate(const std::string &r, const T_items &items, double value) { + void incorporate(const std::string& r, const T_items& items, double value) { relations.at(r)->incorporate(items, value); } - void unincorporate(const std::string &r, const T_items &items) { + void unincorporate(const std::string& r, const T_items& items) { relations.at(r)->unincorporate(items); } void transition_cluster_assignments_all() { - for (const auto &[d, domain] : domains) { + for (const auto& [d, domain] : domains) { for (const T_item item : domain->items) { transition_cluster_assignment_item(d, item); } } } - void transition_cluster_assignments(const std::vector &ds) { - for (const std::string &d : ds) { + void transition_cluster_assignments(const std::vector& ds) { + for (const std::string& d : ds) { for (const T_item item : domains.at(d)->items) { transition_cluster_assignment_item(d, item); } } } - void transition_cluster_assignment_item(const std::string &d, - const T_item &item) { - Domain *domain = domains.at(d); + void transition_cluster_assignment_item(const std::string& d, + const T_item& item) { + Domain* 
domain = domains.at(d); auto crp_dist = domain->tables_weights_gibbs(item); // Compute probability of each table. std::vector tables; std::vector logps; tables.reserve(crp_dist.size()); logps.reserve(crp_dist.size()); - for (const auto &[table, n_customers] : crp_dist) { + for (const auto& [table, n_customers] : crp_dist) { tables.push_back(table); logps.push_back(log(n_customers)); } - for (const auto &r : domain_to_relations.at(d)) { - Relation *relation = relations.at(r); + for (const auto& r : domain_to_relations.at(d)) { + Relation* relation = relations.at(r); if (relation->has_observation(*domain, item)) { std::vector lp_relation = relation->logp_gibbs_exact(*domain, item, tables); @@ -619,8 +619,8 @@ class IRM { T_item choice = tables[idx]; // Move to new table (if necessary). if (choice != domain->get_cluster_assignment(item)) { - for (const std::string &r : domain_to_relations.at(d)) { - Relation *relation = relations.at(r); + for (const std::string& r : domain_to_relations.at(d)) { + Relation* relation = relations.at(r); if (relation->has_observation(*domain, item)) { relation->set_cluster_assignment_gibbs(*domain, item, choice); } @@ -629,8 +629,8 @@ class IRM { } } - double logp(const std::vector> - &observations) { + double logp(const std::vector>& + observations) { std::unordered_map> relation_items_seen; std::unordered_map> @@ -643,17 +643,17 @@ class IRM { std::unordered_map>>> cluster_universe; // Compute all cluster combinations. - for (const auto &[r, items, value] : observations) { + for (const auto& [r, items, value] : observations) { // Assert observation is unique. assert(!relation_items_seen[r].contains(items)); relation_items_seen[r].insert(items); // Process each (domain, item) in the observations. - Relation *relation = relations.at(r); + Relation* relation = relations.at(r); int arity = relation->domains.size(); assert(std::ssize(items) == arity); for (int i = 0; i < arity; ++i) { // Skip if (domain, item) processed. 
- Domain *domain = relation->domains.at(i); + Domain* domain = relation->domains.at(i); T_item item = items.at(i); if (domain_item_seen[domain->name].contains(item)) { assert(cluster_universe[domain->name].contains(item)); @@ -677,7 +677,7 @@ class IRM { auto tables_weights = domain->tables_weights(); double Z = log(domain->crp.alpha + domain->crp.N); size_t idx = 0; - for (const auto &[t, w] : tables_weights) { + for (const auto& [t, w] : tables_weights) { t_list.push_back(t); w_list.push_back(log(w) - Z); i_list.push_back(idx++); @@ -698,7 +698,7 @@ class IRM { std::vector items_product = product(index_universe); std::vector logps; // reserve size logps.reserve(index_universe.size()); - for (const T_items &indexes : items_product) { + for (const T_items& indexes : items_product) { double logp_indexes = 0; // Compute weight of cluster assignments. double weight = 0.0; @@ -707,19 +707,19 @@ class IRM { } logp_indexes += weight; // Compute weight of data given cluster assignments. - for (const auto &[r, items, value] : observations) { - Relation *relation = relations.at(r); + for (const auto& [r, items, value] : observations) { + Relation* relation = relations.at(r); std::vector z; z.reserve(domains.size()); for (int i = 0; i < std::ssize(relation->domains); ++i) { - Domain *domain = relation->domains.at(i); + Domain* domain = relation->domains.at(i); T_item item = items.at(i); - auto &[loc, t_list] = cluster_universe.at(domain->name).at(item); + auto& [loc, t_list] = cluster_universe.at(domain->name).at(item); T_item t = t_list.at(indexes.at(loc)); z.push_back(t); } BetaBernoulli aux(prng); - Distribution *cluster = + Distribution* cluster = relation->clusters.contains(z) ? 
relation->clusters.at(z) : &aux; logp_indexes += cluster->logp(value); } @@ -730,21 +730,21 @@ class IRM { double logp_score() const { double logp_score_crp = 0.0; - for (const auto &[d, domain] : domains) { + for (const auto& [d, domain] : domains) { logp_score_crp += domain->crp.logp_score(); } double logp_score_relation = 0.0; - for (const auto &[r, relation] : relations) { + for (const auto& [r, relation] : relations) { logp_score_relation += relation->logp_score(); } return logp_score_crp + logp_score_relation; } - void add_relation(const std::string &name, const T_relation &relation) { + void add_relation(const std::string& name, const T_relation& relation) { assert(!schema.contains(name)); assert(!relations.contains(name)); - std::vector doms; - for (const auto &d : relation.domains) { + std::vector doms; + for (const auto& d : relation.domains) { if (domains.count(d) == 0) { assert(domain_to_relations.count(d) == 0); domains[d] = new Domain(d, prng); @@ -757,12 +757,12 @@ class IRM { schema[name] = relation; } - void remove_relation(const std::string &name) { + void remove_relation(const std::string& name) { std::unordered_set ds; - for (const Domain *const domain : relations.at(name)->domains) { + for (const Domain* const domain : relations.at(name)->domains) { ds.insert(domain->name); } - for (const auto &d : ds) { + for (const auto& d : ds) { domain_to_relations.at(d).erase(name); // TODO: Remove r from domains.at(d)->items if (domain_to_relations.at(d).empty()) { @@ -777,79 +777,79 @@ class IRM { } // Disable copying. 
- IRM &operator=(const IRM &) = delete; - IRM(const IRM &) = delete; + IRM& operator=(const IRM&) = delete; + IRM(const IRM&) = delete; }; class HIRM { public: - T_schema schema; // schema of relations - std::unordered_map irms; // map from cluster id to IRM + T_schema schema; // schema of relations + std::unordered_map irms; // map from cluster id to IRM std::unordered_map relation_to_code; // map from relation name to code std::unordered_map code_to_relation; // map from code to relation CRP crp; // clustering model for relations - std::mt19937 *prng; + std::mt19937* prng; - HIRM(const T_schema &schema, std::mt19937 *prng) : crp(prng) { + HIRM(const T_schema& schema, std::mt19937* prng) : crp(prng) { this->prng = prng; - for (const auto &[name, relation] : schema) { + for (const auto& [name, relation] : schema) { this->add_relation(name, relation); } } - void incorporate(const std::string &r, const T_items &items, double value) { - IRM *irm = relation_to_irm(r); + void incorporate(const std::string& r, const T_items& items, double value) { + IRM* irm = relation_to_irm(r); irm->incorporate(r, items, value); } - void unincorporate(const std::string &r, const T_items &items) { - IRM *irm = relation_to_irm(r); + void unincorporate(const std::string& r, const T_items& items) { + IRM* irm = relation_to_irm(r); irm->unincorporate(r, items); } - int relation_to_table(const std::string &r) { + int relation_to_table(const std::string& r) { int rc = relation_to_code.at(r); return crp.assignments.at(rc); } - IRM *relation_to_irm(const std::string &r) { + IRM* relation_to_irm(const std::string& r) { int rc = relation_to_code.at(r); int table = crp.assignments.at(rc); return irms.at(table); } - Relation *get_relation(const std::string &r) { - IRM *irm = relation_to_irm(r); + Relation* get_relation(const std::string& r) { + IRM* irm = relation_to_irm(r); return irm->relations.at(r); } void transition_cluster_assignments_all() { - for (const auto &[r, rc] : relation_to_code) { + 
for (const auto& [r, rc] : relation_to_code) { transition_cluster_assignment_relation(r); } } - void transition_cluster_assignments(const std::vector &rs) { - for (const auto &r : rs) { + void transition_cluster_assignments(const std::vector& rs) { + for (const auto& r : rs) { transition_cluster_assignment_relation(r); } } - void transition_cluster_assignment_relation(const std::string &r) { + void transition_cluster_assignment_relation(const std::string& r) { int rc = relation_to_code.at(r); int table_current = crp.assignments.at(rc); - Relation *relation = get_relation(r); + Relation* relation = get_relation(r); T_relation t_relation = relation->get_T_relation(); auto crp_dist = crp.tables_weights_gibbs(table_current); std::vector tables; std::vector logps; - int *table_aux = nullptr; - IRM *irm_aux = nullptr; + int* table_aux = nullptr; + IRM* irm_aux = nullptr; // Compute probabilities of each table. - for (const auto &[table, n_customers] : crp_dist) { - IRM *irm; + for (const auto& [table, n_customers] : crp_dist) { + IRM* irm; if (!irms.contains(table)) { irm = new IRM({}, prng); assert(table_aux == nullptr); assert(irm_aux == nullptr); - table_aux = (int *)malloc(sizeof(*table_aux)); + table_aux = (int*)malloc(sizeof(*table_aux)); *table_aux = table; irm_aux = irm; } else { @@ -857,7 +857,7 @@ class HIRM { } if (table != table_current) { irm->add_relation(r, t_relation); - for (const auto &[items, value] : relation->data) { + for (const auto& [items, value] : relation->data) { irm->incorporate(r, items, value); } } @@ -871,8 +871,8 @@ class HIRM { T_item choice = tables[idx]; // Remove relation from all other tables. 
- for (const auto &[table, customers] : crp.tables) { - IRM *irm = irms.at(table); + for (const auto& [table, customers] : crp.tables) { + IRM* irm = irms.at(table); if (table != choice) { assert(irm->relations.count(r) == 1); irm->remove_relation(r); @@ -896,18 +896,18 @@ class HIRM { crp.unincorporate(rc); crp.incorporate(rc, choice); assert(irms.size() == crp.tables.size()); - for (const auto &[table, irm] : irms) { + for (const auto& [table, irm] : irms) { assert(crp.tables.contains(table)); } } - void set_cluster_assignment_gibbs(const std::string &r, int table) { + void set_cluster_assignment_gibbs(const std::string& r, int table) { assert(irms.size() == crp.tables.size()); int rc = relation_to_code.at(r); int table_current = crp.assignments.at(rc); - Relation *relation = get_relation(r); + Relation* relation = get_relation(r); T_relation trel = relation->get_T_relation(); - IRM *irm = relation_to_irm(r); + IRM* irm = relation_to_irm(r); auto observations = relation->data; // Remove from current IRM. irm->remove_relation(r); @@ -922,19 +922,19 @@ class HIRM { } irm = irms.at(table); irm->add_relation(r, trel); - for (const auto &[items, value] : observations) { + for (const auto& [items, value] : observations) { irm->incorporate(r, items, value); } // Update CRP. 
crp.unincorporate(rc); crp.incorporate(rc, table); assert(irms.size() == crp.tables.size()); - for (const auto &[table, irm] : irms) { + for (const auto& [table, irm] : irms) { assert(crp.tables.contains(table)); } } - void add_relation(const std::string &name, const T_relation &rel) { + void add_relation(const std::string& name, const T_relation& rel) { assert(!schema.contains(name)); schema[name] = rel; int offset = @@ -955,7 +955,7 @@ class HIRM { relation_to_code[name] = rc; code_to_relation[rc] = name; } - void remove_relation(const std::string &name) { + void remove_relation(const std::string& name) { schema.erase(name); int rc = relation_to_code.at(name); int table = crp.assignments.at(rc); @@ -963,7 +963,7 @@ class HIRM { crp.unincorporate(rc); irms.at(table)->remove_relation(name); if (singleton) { - IRM *irm = irms.at(table); + IRM* irm = irms.at(table); assert(irm->relations.empty()); irms.erase(table); delete irm; @@ -972,12 +972,12 @@ class HIRM { code_to_relation.erase(rc); } - double logp(const std::vector> - &observations) { + double logp(const std::vector>& + observations) { std::unordered_map>> obs_dict; - for (const auto &[r, items, value] : observations) { + for (const auto& [r, items, value] : observations) { int rc = relation_to_code.at(r); int table = crp.assignments.at(rc); if (!obs_dict.contains(table)) { @@ -986,7 +986,7 @@ class HIRM { obs_dict.at(table).push_back({r, items, value}); } double logp = 0.0; - for (const auto &[t, o] : obs_dict) { + for (const auto& [t, o] : obs_dict) { logp += irms.at(t)->logp(o); } return logp; @@ -995,19 +995,19 @@ class HIRM { double logp_score() { double logp_score_crp = crp.logp_score(); double logp_score_irms = 0.0; - for (const auto &[table, irm] : irms) { + for (const auto& [table, irm] : irms) { logp_score_irms += irm->logp_score(); } return logp_score_crp + logp_score_irms; } ~HIRM() { - for (const auto &[table, irm] : irms) { + for (const auto& [table, irm] : irms) { delete irm; } } // Disable 
copying. - HIRM &operator=(const HIRM &) = delete; - HIRM(const HIRM &) = delete; + HIRM& operator=(const HIRM&) = delete; + HIRM(const HIRM&) = delete; }; diff --git a/cxx/tests/test_hirm_animals.cc b/cxx/tests/test_hirm_animals.cc index 26a672a..fa458e8 100644 --- a/cxx/tests/test_hirm_animals.cc +++ b/cxx/tests/test_hirm_animals.cc @@ -9,7 +9,7 @@ #include "util_io.hh" #include "util_math.hh" -int main(int argc, char **argv) { +int main(int argc, char** argv) { srand(1); std::mt19937 prng(1); @@ -30,12 +30,12 @@ int main(int argc, char **argv) { incorporate_observations(hirm, encoding_unary, observations_unary); printf("--- incorporated observations --- \n"); int n_obs_unary = 0; - for (const auto &[z, irm] : hirm.irms) { - for (const auto &[r, relation] : irm->relations) { + for (const auto& [z, irm] : hirm.irms) { + for (const auto& [r, relation] : irm->relations) { n_obs_unary += relation->data.size(); } } - assert(n_obs_unary == observations_unary.size()); + assert(n_obs_unary == std::ssize(observations_unary)); hirm.transition_cluster_assignments_all(); hirm.transition_cluster_assignments_all(); @@ -45,15 +45,15 @@ int main(int argc, char **argv) { printf("--- set cluster assignments --- \n"); for (int i = 0; i < 20; i++) { hirm.transition_cluster_assignments_all(); - for (const auto &[t, irm] : hirm.irms) { + for (const auto& [t, irm] : hirm.irms) { irm->transition_cluster_assignments_all(); - for (const auto &[d, domain] : irm->domains) { + for (const auto& [d, domain] : irm->domains) { domain->crp.transition_alpha(); } } hirm.crp.transition_alpha(); printf("%d %f [", i, hirm.logp_score()); - for (const auto &[t, customers] : hirm.crp.tables) { + for (const auto& [t, customers] : hirm.crp.tables) { printf("%ld ", customers.size()); } printf("]\n"); @@ -71,7 +71,7 @@ int main(int argc, char **argv) { std::string path_clusters = path_base + ".hirm"; to_txt(path_clusters, hirm, encoding_unary); - auto &enc = std::get<0>(encoding_unary); + auto& enc = 
std::get<0>(encoding_unary); // Marginally normalized. int persiancat = enc["animal"]["persiancat"]; @@ -118,16 +118,16 @@ int main(int argc, char **argv) { assert(hirm.irms.size() == hirx.irms.size()); // Check IRMs agree. - for (const auto &[table, irm] : hirm.irms) { + for (const auto& [table, irm] : hirm.irms) { auto irx = hirx.irms.at(table); // Check log scores agree. - for (const auto &[d, dm] : irm->domains) { + for (const auto& [d, dm] : irm->domains) { auto dx = irx->domains.at(d); dx->crp.alpha = dm->crp.alpha; } assert(abs(irx->logp_score() - irm->logp_score()) < 1e-8); // Check domains agree. - for (const auto &[d, dm] : irm->domains) { + for (const auto& [d, dm] : irm->domains) { auto dx = irx->domains.at(d); assert(dm->items == dx->items); assert(dm->crp.assignments == dx->crp.assignments); @@ -136,12 +136,12 @@ int main(int argc, char **argv) { assert(dm->crp.alpha == dx->crp.alpha); } // Check relations agree. - for (const auto &[r, rm] : irm->relations) { + for (const auto& [r, rm] : irm->relations) { auto rx = irx->relations.at(r); assert(rm->data == rx->data); assert(rm->data_r == rx->data_r); assert(rm->clusters.size() == rx->clusters.size()); - for (const auto &[z, clusterm] : rm->clusters) { + for (const auto& [z, clusterm] : rm->clusters) { auto clusterx = rx->clusters.at(z); assert(clusterm->N == clusterx->N); } diff --git a/cxx/tests/test_irm_two_relations.cc b/cxx/tests/test_irm_two_relations.cc index 8300c65..143278b 100644 --- a/cxx/tests/test_irm_two_relations.cc +++ b/cxx/tests/test_irm_two_relations.cc @@ -15,7 +15,7 @@ #include "util_io.hh" #include "util_math.hh" -int main(int argc, char **argv) { +int main(int argc, char** argv) { std::string path_base = "assets/two_relations"; int seed = 1; int iters = 2; @@ -25,11 +25,11 @@ int main(int argc, char **argv) { std::string path_schema = path_base + ".schema"; std::cout << "loading schema from " << path_schema << std::endl; auto schema = load_schema(path_schema); - for (auto const 
&[relation_name, relation] : schema) { + for (auto const& [relation_name, relation] : schema) { printf("relation: %s, ", relation_name.c_str()); printf("distribution: %s, ", relation.distribution.c_str()); printf("domains: "); - for (auto const &domain : relation.domains) { + for (auto const& domain : relation.domains) { printf("%s ", domain.c_str()); } printf("\n"); @@ -45,7 +45,7 @@ int main(int argc, char **argv) { printf("running for %d iterations\n", iters); for (int i = 0; i < iters; i++) { irm.transition_cluster_assignments_all(); - for (auto const &[d, domain] : irm.domains) { + for (auto const& [d, domain] : irm.domains) { domain->crp.transition_alpha(); } double x = irm.logp_score(); @@ -75,7 +75,7 @@ int main(int argc, char **argv) { std::vector> indexes{ {code_item_0_D1, code_item_10_D1, code_item_novel}, {code_item_0_D1, code_item_10_D2, code_item_novel}}; - for (const auto &l : product(indexes)) { + for (const auto& l : product(indexes)) { assert(l.size() == 2); auto x1 = l.at(0); auto x2 = l.at(1); @@ -88,7 +88,7 @@ int main(int argc, char **argv) { assert(abs(exp(p0) - expected_p0[x1].at(x2)) < .1); } - for (const auto &l : + for (const auto& l : std::vector>{{0, 10, 100}, {110, 10, 100}}) { auto x1 = l.at(0); auto x2 = l.at(1); @@ -104,14 +104,14 @@ int main(int argc, char **argv) { IRM irx({}, &prng); from_txt(&irx, path_schema, path_obs, path_clusters); // Check log scores agree. - for (const auto &d : {"D1", "D2"}) { + for (const auto& d : {"D1", "D2"}) { auto dm = irm.domains.at(d); auto dx = irx.domains.at(d); dx->crp.alpha = dm->crp.alpha; } assert(abs(irx.logp_score() - irm.logp_score()) < 1e-8); // Check domains agree. - for (const auto &d : {"D1", "D2"}) { + for (const auto& d : {"D1", "D2"}) { auto dm = irm.domains.at(d); auto dx = irx.domains.at(d); assert(dm->items == dx->items); @@ -121,13 +121,13 @@ int main(int argc, char **argv) { assert(dm->crp.alpha == dx->crp.alpha); } // Check relations agree. 
- for (const auto &r : {"R1", "R2"}) { + for (const auto& r : {"R1", "R2"}) { auto rm = irm.relations.at(r); auto rx = irx.relations.at(r); assert(rm->data == rx->data); assert(rm->data_r == rx->data_r); assert(rm->clusters.size() == rx->clusters.size()); - for (const auto &[z, clusterm] : rm->clusters) { + for (const auto& [z, clusterm] : rm->clusters) { auto clusterx = rx->clusters.at(z); assert(clusterm->N == clusterx->N); } diff --git a/cxx/tests/test_misc.cc b/cxx/tests/test_misc.cc index 8eebcaa..5d3b739 100644 --- a/cxx/tests/test_misc.cc +++ b/cxx/tests/test_misc.cc @@ -18,7 +18,7 @@ #include "util_io.hh" #include "util_math.hh" -int main(int argc, char **argv) { +int main(int argc, char** argv) { srand(1); std::mt19937 prng(1); @@ -68,10 +68,10 @@ int main(int argc, char **argv) { printf("%f\n", crp.logp_score()); crp.incorporate(ali, 0); std::cout << "tables count 10 " << crp.tables.count(10) << std::endl; - for (auto const &i : crp.tables[0]) { + for (auto const& i : crp.tables[0]) { std::cout << i << " "; } - for (auto const &i : crp.tables[1]) { + for (auto const& i : crp.tables[1]) { std::cout << i << " "; } printf("\n"); @@ -82,18 +82,18 @@ int main(int argc, char **argv) { printf("=== tables_weights\n"); auto tables_weights = crp.tables_weights(); - for (auto &tw : tables_weights) { + for (auto& tw : tables_weights) { printf("table %d weight %f\n", tw.first, tw.second); } printf("=== tables_weights_gibbs\n"); auto tables_weights_gibbs = crp.tables_weights_gibbs(1); - for (auto &tw : tables_weights_gibbs) { + for (auto& tw : tables_weights_gibbs) { printf("table %d weight %f\n", tw.first, tw.second); } printf("==== tables_weights_gibbs_singleton\n"); auto tables_weights_gibbs_singleton = crp.tables_weights_gibbs(12); - for (auto &tw : tables_weights_gibbs_singleton) { + for (auto& tw : tables_weights_gibbs_singleton) { printf("table %d weight %f\n", tw.first, tw.second); } printf("==== log probability\n"); @@ -106,17 +106,17 @@ int main(int argc, 
char **argv) { T_item salman = 1; T_item mansour = 2; d.incorporate(salman); - for (auto &item : d.items) { + for (auto& item : d.items) { printf("item %d: ", item); } d.set_cluster_assignment_gibbs(salman, 12); d.incorporate(salman); d.incorporate(mansour, 5); - for (auto &item : d.items) { + for (auto& item : d.items) { printf("item %d: ", item); } // d.unincorporate(salman); - for (auto &item : d.items) { + for (auto& item : d.items) { printf("item %d: ", item); } // d.unincorporate(relation2, salman); @@ -126,9 +126,9 @@ int main(int argc, char **argv) { std::unordered_map> m; m[1].insert(10); m[1] = std::unordered_set(); - for (auto &ir : m) { + for (auto& ir : m) { printf("%d\n", ir.first); - for (auto &x : ir.second) { + for (auto& x : ir.second) { printf("%d\n", x); } } @@ -193,14 +193,14 @@ int main(int argc, char **argv) { }; IRM irm(schema1, &prng); - for (auto const &kv : irm.domains) { + for (auto const& kv : irm.domains) { printf("%s %s; ", kv.first.c_str(), kv.second->name.c_str()); for (auto const r : irm.domain_to_relations.at(kv.first)) { printf("%s ", r.c_str()); } printf("\n"); } - for (auto const &kv : irm.relations) { + for (auto const& kv : irm.relations) { printf("%s ", kv.first.c_str()); for (auto const d : kv.second->domains) { printf("%s ", d->name.c_str()); @@ -210,11 +210,11 @@ int main(int argc, char **argv) { printf("==== READING IO ===== \n"); auto schema = load_schema("assets/animals.binary.schema"); - for (auto const &i : schema) { + for (auto const& i : schema) { printf("relation: %s\n", i.first.c_str()); printf("distribution: %s\n", i.second.distribution.c_str()); printf("domains: "); - for (auto const &j : i.second.domains) { + for (auto const& j : i.second.domains) { printf("%s ", j.c_str()); } printf("\n"); @@ -224,7 +224,7 @@ int main(int argc, char **argv) { auto observations = load_observations("assets/animals.binary.obs"); auto encoding = encode_observations(schema, observations); auto item_to_code = std::get<0>(encoding); 
- for (auto const &i : observations) { + for (auto const& i : observations) { auto relation = std::get<0>(i); auto value = std::get<2>(i); auto item = std::get<1>(i); @@ -232,7 +232,7 @@ int main(int argc, char **argv) { printf("%1.f ", value); int counter = 0; T_items items_code; - for (auto const &item : std::get<1>(i)) { + for (auto const& item : std::get<1>(i)) { auto domain = schema.at(relation).domains[counter]; counter += 1; auto code = item_to_code.at(domain).at(item); @@ -246,7 +246,7 @@ int main(int argc, char **argv) { for (int i = 0; i < 4; i++) { irm3.transition_cluster_assignments({"animal", "feature"}); irm3.transition_cluster_assignments_all(); - for (auto const &[d, domain] : irm3.domains) { + for (auto const& [d, domain] : irm3.domains) { domain->crp.transition_alpha(); } double x = irm3.logp_score(); @@ -257,7 +257,7 @@ int main(int argc, char **argv) { to_txt(path_clusters, irm3, encoding); auto rel = irm3.relations.at("has"); - auto &enc = std::get<0>(encoding); + auto& enc = std::get<0>(encoding); auto lp0 = rel->logp({enc["animal"]["tail"], enc["animal"]["bat"]}, 0); auto lp1 = rel->logp({enc["animal"]["tail"], enc["animal"]["bat"]}, 1); auto lp_01 = logsumexp({lp0, lp1}); @@ -272,7 +272,7 @@ int main(int argc, char **argv) { irm4.domains.at("animal")->crp.alpha = irm3.domains.at("animal")->crp.alpha; irm4.domains.at("feature")->crp.alpha = irm3.domains.at("feature")->crp.alpha; assert(abs(irm3.logp_score() - irm4.logp_score()) < 1e-8); - for (const auto &d : {"animal", "feature"}) { + for (const auto& d : {"animal", "feature"}) { auto d3 = irm3.domains.at(d); auto d4 = irm4.domains.at(d); assert(d3->items == d4->items); @@ -281,13 +281,13 @@ int main(int argc, char **argv) { assert(d3->crp.N == d4->crp.N); assert(d3->crp.alpha == d4->crp.alpha); } - for (const auto &r : {"has"}) { + for (const auto& r : {"has"}) { auto r3 = irm3.relations.at(r); auto r4 = irm4.relations.at(r); assert(r3->data == r4->data); assert(r3->data_r == r4->data_r); 
assert(r3->clusters.size() == r4->clusters.size()); - for (const auto &[z, cluster3] : r3->clusters) { + for (const auto& [z, cluster3] : r3->clusters) { auto cluster4 = r4->clusters.at(z); assert(cluster3->N == cluster4->N); } diff --git a/cxx/tests/test_util_math.cc b/cxx/tests/test_util_math.cc deleted file mode 100644 index a724465..0000000 --- a/cxx/tests/test_util_math.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2021 MIT Probabilistic Computing Project -// Apache License, Version 2.0, refer to LICENSE.txt - -#include -#include -#include - -#include "util_math.hh" - -int main(int argc, char **argv) { - std::vector> x{{1}, {2, 3}, {1, 10, 11}}; - - auto cartesian = product(x); - assert(cartesian.size() == 6); - assert((cartesian.at(0) == std::vector{1, 2, 1})); - assert((cartesian.at(1) == std::vector{1, 2, 10})); - assert((cartesian.at(2) == std::vector{1, 2, 11})); - assert((cartesian.at(3) == std::vector{1, 3, 1})); - assert((cartesian.at(4) == std::vector{1, 3, 10})); - assert((cartesian.at(5) == std::vector{1, 3, 11})); - - x.push_back({}); - cartesian = product(x); - assert(cartesian.size() == 0); -} diff --git a/cxx/util_hash.hh b/cxx/util_hash.hh index d7d29f5..3658e5d 100644 --- a/cxx/util_hash.hh +++ b/cxx/util_hash.hh @@ -12,9 +12,9 @@ // TODO(emilyaf): Is this necessary? Is it so that vectors have the same hash // values regardless of the order of their elements? 
struct VectorIntHash { - int operator()(const std::vector &V) const { + int operator()(const std::vector& V) const { int hash = V.size(); - for (auto &i : V) { + for (auto& i : V) { hash ^= i + 0x9e3779b9 + (hash << 6) + (hash >> 2); } return hash; @@ -22,9 +22,9 @@ struct VectorIntHash { }; struct VectorStringHash { - int operator()(const std::vector &V) const { + int operator()(const std::vector& V) const { int hash = V.size(); - for (auto &s : V) { + for (auto& s : V) { hash ^= std::hash{}(s) + 0x9e3779b9 + (hash << 6) + (hash >> 2); } diff --git a/cxx/util_io.cc b/cxx/util_io.cc index 3831728..32dc361 100644 --- a/cxx/util_io.cc +++ b/cxx/util_io.cc @@ -12,7 +12,7 @@ #include #include -T_schema load_schema(const std::string &path) { +T_schema load_schema(const std::string& path) { std::ifstream fp(path, std::ifstream::in); assert(fp.good()); @@ -36,7 +36,7 @@ T_schema load_schema(const std::string &path) { return schema; } -T_observations load_observations(const std::string &path) { +T_observations load_observations(const std::string& path) { std::ifstream fp(path, std::ifstream::in); assert(fp.good()); @@ -64,26 +64,26 @@ T_observations load_observations(const std::string &path) { } // Assumes that T_item is integer. -T_encoding encode_observations(const T_schema &schema, - const T_observations &observations) { +T_encoding encode_observations(const T_schema& schema, + const T_observations& observations) { // Counter and encoding maps. std::map domain_item_counter; T_encoding_f item_to_code; T_encoding_r code_to_item; // Create a counter of items for each domain. - for (const auto &[r, relation] : schema) { - for (const std::string &domain : relation.domains) { + for (const auto& [r, relation] : schema) { + for (const std::string& domain : relation.domains) { domain_item_counter[domain] = 0; item_to_code[domain] = std::map(); code_to_item[domain] = std::map(); } } // Create the codes for each item. 
- for (const T_observation &i : observations) { + for (const T_observation& i : observations) { std::string relation = std::get<0>(i); std::vector items = std::get<1>(i); int counter = 0; - for (const std::string &item : items) { + for (const std::string& item : items) { // Obtain domain that item belongs to. std::string domain = schema.at(relation).domains.at(counter); // Compute its code, if necessary. @@ -99,13 +99,13 @@ T_encoding encode_observations(const T_schema &schema, return std::make_pair(item_to_code, code_to_item); } -void incorporate_observations(IRM &irm, const T_encoding &encoding, - const T_observations &observations) { +void incorporate_observations(IRM& irm, const T_encoding& encoding, + const T_observations& observations) { T_encoding_f item_to_code = std::get<0>(encoding); - for (const auto &[relation, items, value] : observations) { + for (const auto& [relation, items, value] : observations) { int counter = 0; T_items items_e; - for (const std::string &item : items) { + for (const std::string& item : items) { std::string domain = irm.schema.at(relation).domains[counter]; counter += 1; int code = item_to_code.at(domain).at(item); @@ -115,13 +115,13 @@ void incorporate_observations(IRM &irm, const T_encoding &encoding, } } -void incorporate_observations(HIRM &hirm, const T_encoding &encoding, - const T_observations &observations) { +void incorporate_observations(HIRM& hirm, const T_encoding& encoding, + const T_observations& observations) { T_encoding_f item_to_code = std::get<0>(encoding); - for (const auto &[relation, items, value] : observations) { + for (const auto& [relation, items, value] : observations) { int counter = 0; T_items items_e; - for (const std::string &item : items) { + for (const std::string& item : items) { std::string domain = hirm.schema.at(relation).domains[counter]; counter += 1; int code = item_to_code.at(domain).at(item); @@ -131,17 +131,17 @@ void incorporate_observations(HIRM &hirm, const T_encoding &encoding, } } 
-void to_txt(std::ostream &fp, const IRM &irm, const T_encoding &encoding) { +void to_txt(std::ostream& fp, const IRM& irm, const T_encoding& encoding) { T_encoding_r code_to_item = std::get<1>(encoding); - for (const auto &[d, domain] : irm.domains) { + for (const auto& [d, domain] : irm.domains) { auto i0 = domain->crp.tables.begin(); auto i1 = domain->crp.tables.end(); std::map> tables(i0, i1); - for (const auto &[table, items] : tables) { + for (const auto& [table, items] : tables) { fp << domain->name << " "; fp << table << " "; int i = 1; - for (const T_item &item : items) { + for (const T_item& item : items) { fp << code_to_item.at(domain->name).at(item); if (i++ < std::ssize(items)) { fp << " "; @@ -152,12 +152,12 @@ void to_txt(std::ostream &fp, const IRM &irm, const T_encoding &encoding) { } } -void to_txt(std::ostream &fp, const HIRM &hirm, const T_encoding &encoding) { +void to_txt(std::ostream& fp, const HIRM& hirm, const T_encoding& encoding) { // Write the relation clusters. auto i0 = hirm.crp.tables.begin(); auto i1 = hirm.crp.tables.end(); std::map> tables(i0, i1); - for (const auto &[table, rcs] : tables) { + for (const auto& [table, rcs] : tables) { fp << table << " "; int i = 1; for (const T_item rc : rcs) { @@ -171,8 +171,8 @@ void to_txt(std::ostream &fp, const HIRM &hirm, const T_encoding &encoding) { fp << "\n"; // Write the IRMs. 
int j = 0; - for (const auto &[table, rcs] : tables) { - const IRM *const irm = hirm.irms.at(table); + for (const auto& [table, rcs] : tables) { + const IRM* const irm = hirm.irms.at(table); fp << "irm=" << table << "\n"; to_txt(fp, *irm, encoding); if (j < std::ssize(tables) - 1) { @@ -182,16 +182,16 @@ void to_txt(std::ostream &fp, const HIRM &hirm, const T_encoding &encoding) { } } -void to_txt(const std::string &path, const IRM &irm, - const T_encoding &encoding) { +void to_txt(const std::string& path, const IRM& irm, + const T_encoding& encoding) { std::ofstream fp(path); assert(fp.good()); to_txt(fp, irm, encoding); fp.close(); } -void to_txt(const std::string &path, const HIRM &hirm, - const T_encoding &encoding) { +void to_txt(const std::string& path, const HIRM& hirm, + const T_encoding& encoding) { std::ofstream fp(path); assert(fp.good()); to_txt(fp, hirm, encoding); @@ -199,7 +199,7 @@ void to_txt(const std::string &path, const HIRM &hirm, } std::map>> -load_clusters_irm(const std::string &path) { +load_clusters_irm(const std::string& path) { std::ifstream fp(path, std::ifstream::in); assert(fp.good()); @@ -225,7 +225,7 @@ load_clusters_irm(const std::string &path) { return clusters; } -int isnumeric(const std::string &s) { +int isnumeric(const std::string& s) { for (char c : s) { if (!isdigit(c)) { return false; @@ -236,14 +236,17 @@ int isnumeric(const std::string &s) { std::tuple>, // x[table] = {relation // list} - std::map>>> // x[table][domain][table] - // = - // {item - // list} + std::map< + int, + std::map< + std::string, + std::map>>> // x[table][domain][table] + // = + // {item + // list} > -load_clusters_hirm(const std::string &path) { +load_clusters_hirm(const std::string& path) { std::ifstream fp(path, std::ifstream::in); assert(fp.good()); @@ -297,7 +300,7 @@ load_clusters_hirm(const std::string &path) { stream >> second; assert(second.size() > 0); assert(isnumeric(second)); - std::string &domain = first; + std::string& domain = first; 
int table = std::stoi(second); std::vector items; for (std::string item; stream >> item;) { @@ -312,15 +315,15 @@ load_clusters_hirm(const std::string &path) { } assert(relations.size() == irms.size()); - for (const auto &[t, rs] : relations) { + for (const auto& [t, rs] : relations) { assert(irms.count(t) == 1); } fp.close(); return std::make_pair(relations, irms); } -void from_txt(IRM *const irm, const std::string &path_schema, - const std::string &path_obs, const std::string &path_clusters) { +void from_txt(IRM* const irm, const std::string& path_schema, + const std::string& path_obs, const std::string& path_clusters) { // Load the data. T_schema schema = load_schema(path_schema); T_observations observations = load_observations(path_obs); @@ -331,16 +334,16 @@ void from_txt(IRM *const irm, const std::string &path_schema, assert(irm->domains.empty()); assert(irm->relations.empty()); assert(irm->domain_to_relations.empty()); - for (const auto &[r, ds] : schema) { + for (const auto& [r, ds] : schema) { irm->add_relation(r, ds); } // Add the domain entities with fixed clustering. 
T_encoding_f item_to_code = std::get<0>(encoding); - for (const auto &[domain, tables] : clusters) { + for (const auto& [domain, tables] : clusters) { assert(irm->domains.at(domain)->items.size() == 0); - for (const auto &[table, items] : tables) { + for (const auto& [table, items] : tables) { assert(0 <= table); - for (const std::string &item : items) { + for (const std::string& item : items) { T_item code = item_to_code.at(domain).at(item); irm->domains.at(domain)->incorporate(code, table); } @@ -350,8 +353,8 @@ void from_txt(IRM *const irm, const std::string &path_schema, incorporate_observations(*irm, encoding, observations); } -void from_txt(HIRM *const hirm, const std::string &path_schema, - const std::string &path_obs, const std::string &path_clusters) { +void from_txt(HIRM* const hirm, const std::string& path_schema, + const std::string& path_obs, const std::string& path_clusters) { T_schema schema = load_schema(path_schema); T_observations observations = load_observations(path_obs); T_encoding encoding = encode_observations(schema, observations); @@ -361,16 +364,16 @@ void from_txt(HIRM *const hirm, const std::string &path_schema, assert(hirm->irms.empty()); assert(hirm->relation_to_code.empty()); assert(hirm->code_to_relation.empty()); - for (const auto &[r, ds] : schema) { + for (const auto& [r, ds] : schema) { hirm->add_relation(r, ds); assert(hirm->irms.size() == hirm->crp.tables.size()); hirm->set_cluster_assignment_gibbs(r, -1); } // Add each IRM. - for (const auto &[table, rs] : relations) { + for (const auto& [table, rs] : relations) { assert(hirm->irms.size() == hirm->crp.tables.size()); // Add relations to the IRM. 
- for (const std::string &r : rs) { + for (const std::string& r : rs) { assert(hirm->irms.size() == hirm->crp.tables.size()); int table_current = hirm->relation_to_table(r); if (table_current != table) { @@ -380,15 +383,15 @@ void from_txt(HIRM *const hirm, const std::string &path_schema, } // Add the domain entities with fixed clustering to this IRM. // TODO: Duplicated code with from_txt(IRM) - IRM *irm = hirm->irms.at(table); + IRM* irm = hirm->irms.at(table); auto clusters = irms.at(table); assert(irm->relations.size() == rs.size()); T_encoding_f item_to_code = std::get<0>(encoding); - for (const auto &[domain, tables] : clusters) { + for (const auto& [domain, tables] : clusters) { assert(irm->domains.at(domain)->items.size() == 0); - for (const auto &[t, items] : tables) { + for (const auto& [t, items] : tables) { assert(0 <= t); - for (const std::string &item : items) { + for (const std::string& item : items) { int code = item_to_code.at(domain).at(item); irm->domains.at(domain)->incorporate(code, t); } diff --git a/cxx/util_io.hh b/cxx/util_io.hh index 95a2be0..a4c8286 100644 --- a/cxx/util_io.hh +++ b/cxx/util_io.hh @@ -16,37 +16,40 @@ typedef std::unordered_map T_assignment; typedef std::unordered_map T_assignments; // disk IO -T_schema load_schema(const std::string &path); -T_observations load_observations(const std::string &path); -T_encoding encode_observations(const T_schema &schema, - const T_observations &observations); - -void incorporate_observations(IRM &irm, const T_encoding &encoding, - const T_observations &observations); -void incorporate_observations(HIRM &hirm, const T_encoding &encoding, - const T_observations &observations); - -void to_txt(const std::string &path, const IRM &irm, - const T_encoding &encoding); -void to_txt(const std::string &path, const HIRM &irm, - const T_encoding &encoding); -void to_txt(std::ostream &fp, const IRM &irm, const T_encoding &encoding); -void to_txt(std::ostream &fp, const HIRM &irm, const T_encoding 
&encoding); +T_schema load_schema(const std::string& path); +T_observations load_observations(const std::string& path); +T_encoding encode_observations(const T_schema& schema, + const T_observations& observations); + +void incorporate_observations(IRM& irm, const T_encoding& encoding, + const T_observations& observations); +void incorporate_observations(HIRM& hirm, const T_encoding& encoding, + const T_observations& observations); + +void to_txt(const std::string& path, const IRM& irm, + const T_encoding& encoding); +void to_txt(const std::string& path, const HIRM& irm, + const T_encoding& encoding); +void to_txt(std::ostream& fp, const IRM& irm, const T_encoding& encoding); +void to_txt(std::ostream& fp, const HIRM& irm, const T_encoding& encoding); std::map>> -load_clusters_irm(const std::string &path); +load_clusters_irm(const std::string& path); std::tuple>, // x[table] = {relation // list} - std::map>>> // x[table][domain][table] - // = - // {item - // list} + std::map< + int, + std::map< + std::string, + std::map>>> // x[table][domain][table] + // = + // {item + // list} > -load_clusters_hirm(const std::string &path); +load_clusters_hirm(const std::string& path); -void from_txt(IRM *const irm, const std::string &path_schema, - const std::string &path_obs, const std::string &path_clusters); -void from_txt(HIRM *const irm, const std::string &path_schema, - const std::string &path_obs, const std::string &path_clusters); +void from_txt(IRM* const irm, const std::string& path_schema, + const std::string& path_obs, const std::string& path_clusters); +void from_txt(HIRM* const irm, const std::string& path_schema, + const std::string& path_obs, const std::string& path_clusters); diff --git a/cxx/util_math.cc b/cxx/util_math.cc index 691fef8..ea8dd54 100644 --- a/cxx/util_math.cc +++ b/cxx/util_math.cc @@ -31,7 +31,7 @@ std::vector log_linspace(double start, double stop, int num, return v; } -std::vector log_normalize(const std::vector &weights) { +std::vector 
log_normalize(const std::vector& weights) { double Z = logsumexp(weights); std::vector result(weights.size()); for (int i = 0; i < std::ssize(weights); i++) { @@ -40,7 +40,7 @@ std::vector log_normalize(const std::vector &weights) { return result; } -double logsumexp(const std::vector &weights) { +double logsumexp(const std::vector& weights) { // Get the max index. int max_index = std::distance( weights.cbegin(), std::max_element(weights.cbegin(), weights.cend())); @@ -57,13 +57,13 @@ double logsumexp(const std::vector &weights) { return log1p(s) + m; } -int choice(const std::vector &weights, std::mt19937 *prng) { +int choice(const std::vector& weights, std::mt19937* prng) { std::discrete_distribution dist(weights.begin(), weights.end()); int idx = dist(*prng); return idx; } -int log_choice(const std::vector &weights, std::mt19937 *prng) { +int log_choice(const std::vector& weights, std::mt19937* prng) { std::vector log_weights_norm = log_normalize(weights); std::vector weights_norm; for (double w : log_weights_norm) { @@ -73,20 +73,20 @@ int log_choice(const std::vector &weights, std::mt19937 *prng) { } std::vector> product( - const std::vector> &lists) { + const std::vector>& lists) { // https://rosettacode.org/wiki/Cartesian_product_of_two_or_more_lists#C.2B.2B std::vector> result; - for (const auto &l : lists) { + for (const auto& l : lists) { if (l.empty()) { return result; } } - for (const int &e : lists[0]) { + for (const int& e : lists[0]) { result.push_back({e}); } for (size_t i = 1; i < lists.size(); ++i) { std::vector> temp; - for (std::vector &e : result) { + for (std::vector& e : result) { for (int f : lists[i]) { std::vector e_tmp = e; e_tmp.push_back(f); @@ -98,7 +98,8 @@ std::vector> product( return result; } -int sample_from_logps(const std::vector &log_probs, std::mt19937 *prng) { +int sample_from_logps(const std::vector& log_probs, + std::mt19937* prng) { double max_lp = *std::max_element(log_probs.begin(), log_probs.end()); std::vector weights; 
for (auto lp : log_probs) { diff --git a/cxx/util_math.hh b/cxx/util_math.hh index b2be4ea..0c91858 100644 --- a/cxx/util_math.hh +++ b/cxx/util_math.hh @@ -11,14 +11,14 @@ double lbeta(double z, double w); std::vector linspace(double start, double stop, int num, bool endpoint); std::vector log_linspace(double start, double stop, int num, bool endpoint); -std::vector log_normalize(const std::vector &weights); -double logsumexp(const std::vector &weights); +std::vector log_normalize(const std::vector& weights); +double logsumexp(const std::vector& weights); -int choice(const std::vector &weights, std::mt19937 *prng); -int log_choice(const std::vector &weights, std::mt19937 *prng); +int choice(const std::vector& weights, std::mt19937* prng); +int log_choice(const std::vector& weights, std::mt19937* prng); std::vector> product( - const std::vector> &lists); + const std::vector>& lists); // Given a vector of log probabilities, return a sample. -int sample_from_logps(const std::vector &log_probs, std::mt19937 *prng); +int sample_from_logps(const std::vector& log_probs, std::mt19937* prng); diff --git a/cxx/util_math_test.cc b/cxx/util_math_test.cc index 22fe928..1fd5911 100644 --- a/cxx/util_math_test.cc +++ b/cxx/util_math_test.cc @@ -118,16 +118,14 @@ BOOST_AUTO_TEST_CASE(test_product) { } } -BOOST_AUTO_TEST_CASE(test_sample_from_logps) -{ +BOOST_AUTO_TEST_CASE(test_sample_from_logps) { // One of these entries isn't like the others, ... std::vector logps = {-1.0, -2.0, 20.0, -3.0}; std::mt19937 prng; BOOST_TEST(2 == sample_from_logps(logps, &prng)); } -BOOST_AUTO_TEST_CASE(test_sample_from_logps_all_small) -{ +BOOST_AUTO_TEST_CASE(test_sample_from_logps_all_small) { std::vector logps = {-10.0, -20.0, -30.0, -40.0, -50.0}; std::mt19937 prng; BOOST_TEST(0 == sample_from_logps(logps, &prng));