From c318484bf5c26ea54602b861a69d9edcd36ab909 Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Thu, 6 Jun 2024 11:37:07 -0700 Subject: [PATCH] Separate out header and source files for distributions --- cxx/distributions/BUILD | 12 ++- cxx/distributions/adapter.hh | 4 +- cxx/distributions/beta_bernoulli.cc | 63 +++++++++++ cxx/distributions/beta_bernoulli.hh | 57 ++-------- cxx/distributions/bigram.cc | 120 +++++++++++++++++++++ cxx/distributions/bigram.hh | 118 +++----------------- cxx/distributions/dirichlet_categorical.cc | 65 +++++++++++ cxx/distributions/dirichlet_categorical.hh | 66 +++--------- cxx/distributions/normal.hh | 2 +- 9 files changed, 293 insertions(+), 214 deletions(-) create mode 100644 cxx/distributions/beta_bernoulli.cc create mode 100644 cxx/distributions/bigram.cc create mode 100644 cxx/distributions/dirichlet_categorical.cc diff --git a/cxx/distributions/BUILD b/cxx/distributions/BUILD index df6b4da..61f12b5 100644 --- a/cxx/distributions/BUILD +++ b/cxx/distributions/BUILD @@ -1,14 +1,13 @@ licenses(["notice"]) - cc_library( name = "distributions", visibility = ["//:__subpackages__"], deps = [ ":adapter", ":base", - ":bigram", ":beta_bernoulli", + ":bigram", ":dirichlet_categorical", ":normal", ], @@ -31,6 +30,7 @@ cc_library( cc_library( name = "beta_bernoulli", + srcs = ["beta_bernoulli.cc"], hdrs = ["beta_bernoulli.hh"], visibility = ["//:__subpackages__"], deps = [ @@ -42,6 +42,7 @@ cc_library( cc_library( name = "bigram", + srcs = ["bigram.cc"], hdrs = ["bigram.hh"], visibility = ["//:__subpackages__"], deps = [ @@ -54,7 +55,8 @@ cc_library( cc_library( name = "dirichlet_categorical", - srcs = ["dirichlet_categorical.hh"], + srcs = ["dirichlet_categorical.cc"], + hdrs = ["dirichlet_categorical.hh"], visibility = ["//:__subpackages__"], deps = [ ":base", @@ -64,8 +66,8 @@ cc_library( cc_library( name = "normal", - hdrs = ["normal.hh"], srcs = ["normal.cc"], + hdrs = ["normal.hh"], visibility = ["//:__subpackages__"], deps = [ ":base", @@ -109,8 +111,8 @@ cc_test( name = "dirichlet_categorical_test", srcs = ["dirichlet_categorical_test.cc"], deps = [ - ":dirichlet_categorical", ":beta_bernoulli", + ":dirichlet_categorical", "@boost//:algorithm", "@boost//:test", ], diff --git a/cxx/distributions/adapter.hh b/cxx/distributions/adapter.hh index aef5a8a..b679990 100644 --- a/cxx/distributions/adapter.hh +++ b/cxx/distributions/adapter.hh @@ -56,9 +56,7 @@ class DistributionAdapter : public Distribution { return to_string(s); } - void transition_hyperparameters() { - d->transition_hyperparameters(); - } + void transition_hyperparameters() { d->transition_hyperparameters(); } ~DistributionAdapter() { delete d; } }; diff --git a/cxx/distributions/beta_bernoulli.cc b/cxx/distributions/beta_bernoulli.cc new file mode 100644 index 0000000..032e09c --- /dev/null +++ b/cxx/distributions/beta_bernoulli.cc @@ -0,0 +1,63 @@ +// Copyright 2024 +// See LICENSE.txt + +#include "distributions/beta_bernoulli.hh" + +#include + +#include "util_math.hh" + +void BetaBernoulli::incorporate(const double& x) { + assert(x == 0 || x == 1); + ++N; + s += x; +} + +void BetaBernoulli::unincorporate(const double& x) { + assert(x == 0 || x == 1); + --N; + s -= x; + assert(0 <= s); + assert(0 <= N); +} + +double BetaBernoulli::logp(const double& x) const { + assert(x == 0 || x == 1); + double log_denom = log(N + alpha + beta); + double log_numer = x ? log(s + alpha) : log(N - s + beta); + return log_numer - log_denom; +} + +double BetaBernoulli::logp_score() const { + double v1 = lbeta(s + alpha, N - s + beta); + double v2 = lbeta(alpha, beta); + return v1 - v2; +} + +double BetaBernoulli::sample() { + double p = exp(logp(1)); + std::vector items{0, 1}; + std::vector weights{1 - p, p}; + int idx = choice(weights, prng); + return items[idx]; +} + +void BetaBernoulli::transition_hyperparameters() { + std::vector logps; + std::vector> hypers; + // C++ doesn't yet allow range for-loops over existing variables. Sigh. + for (double alphat : alpha_grid) { + for (double betat : beta_grid) { + alpha = alphat; + beta = betat; + double lp = logp_score(); + if (!std::isnan(lp)) { + logps.push_back(logp_score()); + hypers.push_back(std::make_pair(alpha, beta)); + } + } + } + int i = sample_from_logps(logps, prng); + alpha = hypers[i].first; + beta = hypers[i].second; +} diff --git a/cxx/distributions/beta_bernoulli.hh b/cxx/distributions/beta_bernoulli.hh index 6dcea49..d0b0ce7 100644 --- a/cxx/distributions/beta_bernoulli.hh +++ b/cxx/distributions/beta_bernoulli.hh @@ -23,57 +23,16 @@ class BetaBernoulli : public Distribution { alpha_grid = log_linspace(1e-4, 1e4, 10, true); beta_grid = log_linspace(1e-4, 1e4, 10, true); } - void incorporate(const double& x) { - assert(x == 0 || x == 1); - ++N; - s += x; - } - void unincorporate(const double& x) { - assert(x == 0 || x == 1); - --N; - s -= x; - assert(0 <= s); - assert(0 <= N); - } - double logp(const double& x) const { - assert(x == 0 || x == 1); - double log_denom = log(N + alpha + beta); - double log_numer = x ? log(s + alpha) : log(N - s + beta); - return log_numer - log_denom; - } + void incorporate(const double& x); - double logp_score() const { - double v1 = lbeta(s + alpha, N - s + beta); - double v2 = lbeta(alpha, beta); - return v1 - v2; - } + void unincorporate(const double& x); - double sample() { - double p = exp(logp(1)); - std::vector items{0, 1}; - std::vector weights{1 - p, p}; - int idx = choice(weights, prng); - return items[idx]; - } + double logp(const double& x) const; - void transition_hyperparameters() { - std::vector logps; - std::vector> hypers; - // C++ doesn't yet allow range for-loops over existing variables. Sigh. - for (double alphat : alpha_grid) { - for (double betat : beta_grid) { - alpha = alphat; - beta = betat; - double lp = logp_score(); - if (!std::isnan(lp)) { - logps.push_back(logp_score()); - hypers.push_back(std::make_pair(alpha, beta)); - } - } - } - int i = sample_from_logps(logps, prng); - alpha = hypers[i].first; - beta = hypers[i].second; - } + double logp_score() const; + + double sample(); + + void transition_hyperparameters(); }; diff --git a/cxx/distributions/bigram.cc b/cxx/distributions/bigram.cc new file mode 100644 index 0000000..147b729 --- /dev/null +++ b/cxx/distributions/bigram.cc @@ -0,0 +1,120 @@ +// Copyright 2024 +// See LICENSE.txt + +#include "distributions/bigram.hh" + +#include + +#include "distributions/base.hh" + +void Bigram::assert_valid_char(const char c) const { + assert(c >= ' ' && c <= '~'); +} + +size_t Bigram::char_to_index(const char c) const { + assert_valid_char(c); + return c - ' '; +} + +char Bigram::index_to_char(const size_t i) const { + const char c = i + ' '; + assert_valid_char(c); + return c; +} + +std::vector Bigram::string_to_indices(const std::string& str) const { + // Convert the string to a vector of indices between 0 and `num_chars`, + // with a start/stop symbol at the beginning/end. + std::vector inds = {num_chars}; + for (const char& c : str) { + inds.push_back(char_to_index(c)); + } + inds.push_back(num_chars); + return inds; +} + +void Bigram::incorporate(const std::string& x) { + const std::vector indices = string_to_indices(x); + for (size_t i = 0; i != indices.size() - 1; ++i) { + transition_dists[indices[i]].incorporate(indices[i + 1]); + } + ++N; +} + +void Bigram::unincorporate(const std::string& s) { + const std::vector indices = string_to_indices(s); + for (size_t i = 0; i != indices.size() - 1; ++i) { + transition_dists[indices[i]].unincorporate(indices[i + 1]); + } + --N; +} + +double Bigram::logp(const std::string& s) const { + const std::vector indices = string_to_indices(s); + double total_logp = 0.0; + for (size_t i = 0; i != indices.size() - 1; ++i) { + total_logp += transition_dists[indices[i]].logp(indices[i + 1]); + // Incorporate each value so that subsequent probabilities are + // conditioned on it. + transition_dists[indices[i]].incorporate(indices[i + 1]); + } + for (size_t i = 0; i != indices.size() - 1; ++i) { + transition_dists[indices[i]].unincorporate(indices[i + 1]); + } + return total_logp; +} + +double Bigram::logp_score() const { + double logp = 0; + for (const auto& d : transition_dists) { + logp += d.logp_score(); + } + return logp; +} + +std::string Bigram::sample() { + std::string sampled_string; + // TODO(emilyaf): Reconsider the reserved length and maybe enforce a + // max length. + sampled_string.reserve(30); + // Sample the first character conditioned on the stop/start symbol. + size_t current_ind = num_chars; + size_t next_ind = transition_dists[current_ind].sample(); + transition_dists[current_ind].incorporate(next_ind); + current_ind = next_ind; + + // Sample additional characters until the stop/start symbol is sampled. + // Incorporate the sampled character at each loop iteration so that + // subsequent samples are conditioned on its observation. + while (current_ind != num_chars) { + sampled_string += index_to_char(current_ind); + next_ind = transition_dists[current_ind].sample(); + transition_dists[current_ind].incorporate(next_ind); + current_ind = next_ind; + } + unincorporate(sampled_string); + return sampled_string; +} + +void Bigram::set_alpha(double alphat) { + alpha = alphat; + for (auto& trans_dist : transition_dists) { + trans_dist.alpha = alpha; + } +} + +void Bigram::transition_hyperparameters() { + std::vector logps; + std::vector alphas; + // C++ doesn't yet allow range for-loops over existing variables. Sigh. + for (double alphat : ALPHA_GRID) { + set_alpha(alphat); + double lp = logp_score(); + if (!std::isnan(lp)) { + logps.push_back(logp_score()); + alphas.push_back(alphat); + } + } + int i = sample_from_logps(logps, prng); + set_alpha(alphas[i]); +} diff --git a/cxx/distributions/bigram.hh b/cxx/distributions/bigram.hh index 07f633f..6fc664a 100644 --- a/cxx/distributions/bigram.hh +++ b/cxx/distributions/bigram.hh @@ -2,33 +2,19 @@ // See LICENSE.txt #pragma once -#include -#include "base.hh" -#include "dirichlet_categorical.hh" +#include "distributions/base.hh" +#include "distributions/dirichlet_categorical.hh" class Bigram : public Distribution { private: - void assert_valid_char(const char c) const { assert(c >= ' ' && c <= '~'); } - size_t char_to_index(const char c) const { - assert_valid_char(c); - return c - ' '; - } - char index_to_char(const size_t i) const { - const char c = i + ' '; - assert_valid_char(c); - return c; - } - std::vector string_to_indices(const std::string& str) const { - // Convert the string to a vector of indices between 0 and `num_chars`, - // with a start/stop symbol at the beginning/end. - std::vector inds = {num_chars}; - for (const char& c : str) { - inds.push_back(char_to_index(c)); - } - inds.push_back(num_chars); - return inds; - } + void assert_valid_char(const char c) const; + + size_t char_to_index(const char c) const; + + char index_to_char(const size_t i) const; + + std::vector string_to_indices(const std::string& str) const; public: double alpha = 1; // hyperparameter for all transition distributions. @@ -48,89 +34,17 @@ class Bigram : public Distribution { } } - void incorporate(const std::string& x) { - const std::vector indices = string_to_indices(x); - for (size_t i = 0; i != indices.size() - 1; ++i) { - transition_dists[indices[i]].incorporate(indices[i + 1]); - } - ++N; - } + void incorporate(const std::string& x); - void unincorporate(const std::string& s) { - const std::vector indices = string_to_indices(s); - for (size_t i = 0; i != indices.size() - 1; ++i) { - transition_dists[indices[i]].unincorporate(indices[i + 1]); - } - --N; - } + void unincorporate(const std::string& s); - double logp(const std::string& s) const { - const std::vector indices = string_to_indices(s); - double total_logp = 0.0; - for (size_t i = 0; i != indices.size() - 1; ++i) { - total_logp += transition_dists[indices[i]].logp(indices[i + 1]); - // Incorporate each value so that subsequent probabilities are - // conditioned on it. - transition_dists[indices[i]].incorporate(indices[i + 1]); - } - for (size_t i = 0; i != indices.size() - 1; ++i) { - transition_dists[indices[i]].unincorporate(indices[i + 1]); - } - return total_logp; - } + double logp(const std::string& s) const; - double logp_score() const { - double logp = 0; - for (const auto& d : transition_dists) { - logp += d.logp_score(); - } - return logp; - } + double logp_score() const; - std::string sample() { - std::string sampled_string; - // TODO(emilyaf): Reconsider the reserved length and maybe enforce a - // max length. - sampled_string.reserve(30); - // Sample the first character conditioned on the stop/start symbol. - size_t current_ind = num_chars; - size_t next_ind = transition_dists[current_ind].sample(); - transition_dists[current_ind].incorporate(next_ind); - current_ind = next_ind; - - // Sample additional characters until the stop/start symbol is sampled. - // Incorporate the sampled character at each loop iteration so that - // subsequent samples are conditioned on its observation. - while (current_ind != num_chars) { - sampled_string += index_to_char(current_ind); - next_ind = transition_dists[current_ind].sample(); - transition_dists[current_ind].incorporate(next_ind); - current_ind = next_ind; - } - unincorporate(sampled_string); - return sampled_string; - } + std::string sample(); - void set_alpha(double alphat) { - alpha = alphat; - for (auto& trans_dist : transition_dists) { - trans_dist.alpha = alpha; - } - } + void set_alpha(double alphat); - void transition_hyperparameters() { - std::vector logps; - std::vector alphas; - // C++ doesn't yet allow range for-loops over existing variables. Sigh. - for (double alphat : ALPHA_GRID) { - set_alpha(alphat); - double lp = logp_score(); - if (!std::isnan(lp)) { - logps.push_back(logp_score()); - alphas.push_back(alphat); - } - } - int i = sample_from_logps(logps, prng); - set_alpha(alphas[i]); - } + void transition_hyperparameters(); }; diff --git a/cxx/distributions/dirichlet_categorical.cc b/cxx/distributions/dirichlet_categorical.cc new file mode 100644 index 0000000..c153ec9 --- /dev/null +++ b/cxx/distributions/dirichlet_categorical.cc @@ -0,0 +1,65 @@ +// Copyright 2024 +// See LICENSE.txt + +#include "distributions/dirichlet_categorical.hh" + +#include +#include +#include + +#include "util_math.hh" + +void DirichletCategorical::incorporate(const double& x) { + assert(x >= 0 && x < counts.size()); + counts[size_t(x)] += 1; + ++N; +} + +void DirichletCategorical::unincorporate(const double& x) { + const size_t y = x; + assert(y < counts.size()); + counts[y] -= 1; + --N; + assert(0 <= counts[y]); + assert(0 <= N); +} + +double DirichletCategorical::logp(const double& x) const { + assert(x >= 0 && x < counts.size()); + const double numer = log(alpha + counts[size_t(x)]); + const double denom = log(N + alpha * counts.size()); + return numer - denom; +} + +double DirichletCategorical::logp_score() const { + const size_t k = counts.size(); + const double a = alpha * k; + double lg = 0; + for (size_t x : counts) { + lg += lgamma(static_cast(x) + alpha); + } + return lgamma(a) - lgamma(a + N) + lg - k * lgamma(alpha); +} +double DirichletCategorical::sample() { + std::vector weights(counts.size()); + std::transform(counts.begin(), counts.end(), weights.begin(), + [&](size_t y) -> double { return y + alpha; }); + int idx = choice(weights, prng); + return double(idx); +} + +void DirichletCategorical::transition_hyperparameters() { + std::vector logps; + std::vector alphas; + // C++ doesn't yet allow range for-loops over existing variables. Sigh. + for (double alphat : ALPHA_GRID) { + alpha = alphat; + double lp = logp_score(); + if (!std::isnan(lp)) { + logps.push_back(logp_score()); + alphas.push_back(alpha); + } + } + int i = sample_from_logps(logps, prng); + alpha = alphas[i]; +} diff --git a/cxx/distributions/dirichlet_categorical.hh b/cxx/distributions/dirichlet_categorical.hh index b17d59d..0fe7150 100644 --- a/cxx/distributions/dirichlet_categorical.hh +++ b/cxx/distributions/dirichlet_categorical.hh @@ -2,12 +2,9 @@ // See LICENSE.txt #pragma once -#include -#include -#include #include -#include "base.hh" +#include "distributions/base.hh" #include "util_math.hh" #define ALPHA_GRID \ @@ -25,54 +22,15 @@ class DirichletCategorical : public Distribution { this->prng = prng; counts = std::vector(k, 0); } - void incorporate(const double& x) { - assert(x >= 0 && x < counts.size()); - counts[size_t(x)] += 1; - ++N; - } - void unincorporate(const double& x) { - const size_t y = x; - assert(y < counts.size()); - counts[y] -= 1; - --N; - assert(0 <= counts[y]); - assert(0 <= N); - } - double logp(const double& x) const { - assert(x >= 0 && x < counts.size()); - const double numer = log(alpha + counts[size_t(x)]); - const double denom = log(N + alpha * counts.size()); - return numer - denom; - } - double logp_score() const { - const size_t k = counts.size(); - const double a = alpha * k; - double lg = 0; - for (size_t x : counts) { - lg += lgamma(static_cast(x) + alpha); - } - return lgamma(a) - lgamma(a + N) + lg - k * lgamma(alpha); - } - double sample() { - std::vector weights(counts.size()); - std::transform(counts.begin(), counts.end(), weights.begin(), - [&](size_t y) -> double { return y + alpha; }); - int idx = choice(weights, prng); - return double(idx); - } - void transition_hyperparameters() { - std::vector logps; - std::vector alphas; - // C++ doesn't yet allow range for-loops over existing variables. Sigh. - for (double alphat : ALPHA_GRID) { - alpha = alphat; - double lp = logp_score(); - if (!std::isnan(lp)) { - logps.push_back(logp_score()); - alphas.push_back(alpha); - } - } - int i = sample_from_logps(logps, prng); - alpha = alphas[i]; - } + void incorporate(const double& x); + + void unincorporate(const double& x); + + double logp(const double& x) const; + + double logp_score() const; + + double sample(); + + void transition_hyperparameters(); }; diff --git a/cxx/distributions/normal.hh b/cxx/distributions/normal.hh index e230f79..aed4552 100644 --- a/cxx/distributions/normal.hh +++ b/cxx/distributions/normal.hh @@ -50,7 +50,7 @@ class Normal : public Distribution { void unincorporate(const double& x); - void posterior_hypers(double *mprime, double *sprime) const; + void posterior_hypers(double* mprime, double* sprime) const; double logp(const double& x) const;