Skip to content

Commit

Permalink
Refactor regression component
Browse files Browse the repository at this point in the history
  • Loading branch information
jessegrabowski committed Dec 12, 2024
1 parent 155dcd6 commit 8a630ce
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 28 deletions.
37 changes: 19 additions & 18 deletions pymc_experimental/model/modular/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
PRIOR_DEFAULT_KWARGS,
ColumnType,
PoolingType,
at_least_list,
get_X_data,
make_hierarchical_prior,
select_data_columns,
Expand Down Expand Up @@ -112,13 +113,8 @@ def __init__(

self.prior = prior
self.prior_params = prior_params if prior_params is not None else {}
self.pooling_columns = at_least_list(pooling_columns)

if pooling_columns is None:
pooling_columns = []
elif isinstance(pooling_columns, str):
pooling_columns = [pooling_columns]

self.pooling_columns = pooling_columns
name = name or f"Intercept(pooling_cols={pooling_columns})"

super().__init__(name=name)
Expand Down Expand Up @@ -158,7 +154,7 @@ def __init__(
pooling: PoolingType = "complete",
pooling_columns: ColumnType | None = None,
hierarchical_params: dict | None = None,
**prior_params,
prior_params: dict | None = None,
):
"""
Class to represent a regression component in a GLM model.
Expand Down Expand Up @@ -197,12 +193,13 @@ def __init__(
prior_params:
Additional keyword arguments to pass to the PyMC distribution specified by the prior argument.
"""
self.feature_columns = feature_columns
self.feature_columns = at_least_list(feature_columns)
self.pooling = pooling
self.pooling_columns = pooling_columns
self.pooling_columns = at_least_list(pooling_columns)

self.prior = prior
self.prior_params = prior_params
self.prior_params = {} if prior_params is None else prior_params
self.hierarchical_params = {} if hierarchical_params is None else hierarchical_params

name = name if name else f"Regression({feature_columns})"

Expand All @@ -213,23 +210,27 @@ def build(self, model=None):
feature_dim = f"{self.name}_features"

if feature_dim not in model.coords:
model.add_coord(feature_dim, self.X.columns)
model.add_coord(feature_dim, self.feature_columns)

with model:
X = select_data_columns(get_X_data(model), self.feature_columns)
full_X = get_X_data(model)
X = select_data_columns(self.feature_columns, model, squeeze=False)

if self.pooling == "complete":
beta = getattr(pm, self.prior)(
f"{self.name}", **self.prior_params, dims=[feature_dim]
)
prior_params = PRIOR_DEFAULT_KWARGS[self.prior].copy()
prior_params.update(self.prior_params)

beta = getattr(pm, self.prior)(f"{self.name}", **prior_params, dims=[feature_dim])
return X @ beta

beta = make_hierarchical_prior(
self.name,
self.index_data,
name=self.name,
X=full_X,
pooling=self.pooling,
pooling_columns=self.pooling_columns,
model=model,
dims=[feature_dim],
no_pooling=self.pooling == "none",
**self.hierarchical_params,
)

regression_effect = (X * beta.T).sum(axis=-1)
Expand Down
25 changes: 19 additions & 6 deletions pymc_experimental/model/modular/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def select_data_columns(
cols: str | Sequence[str] | None,
model: pm.Model | None = None,
data_name: str = "X_data",
squeeze=True,
) -> pt.TensorVariable | None:
"""
Create a tensor variable representing a subset of independent data columns.
Expand Down Expand Up @@ -72,7 +73,7 @@ def select_data_columns(
cols_idx = [model.coords["feature"].index(col) for col in cols]

# Single columns are returned as 1d arrays
if len(cols_idx) == 1:
if len(cols_idx) == 1 and squeeze:
cols_idx = cols_idx[0]

return get_X_data(model, data_name=data_name)[:, cols_idx]
Expand Down Expand Up @@ -110,6 +111,14 @@ def encode_categoricals(df, coords):
return df, coords


def at_least_list(columns: ColumnType):
    """Normalize a column specification to a list of column names.

    Parameters
    ----------
    columns : ColumnType
        ``None``, a single column name, or a sequence of column names.

    Returns
    -------
    list | Sequence[str]
        An empty list if ``columns`` is ``None``, a one-element list if it
        is a single string, otherwise ``columns`` unchanged.
    """
    # Guard clauses: normalize the two scalar-like cases, pass sequences through.
    if columns is None:
        return []
    if isinstance(columns, str):
        return [columns]
    return columns


def make_level_maps(X: SharedVariable, coords: dict[str, tuple | None], ordered_levels: list[str]):
r"""
For each row of data, create a mapping between levels of a arbitrary set of levels defined by `ordered_levels`.
Expand Down Expand Up @@ -304,7 +313,7 @@ def make_partial_pooled_hierarchy(
prior_params.update(prior_kwargs)

with model:
beta = Prior(f"{name}_effect", **prior_params, dims=dims)
beta = Prior(f"{name}", **prior_params, dims=dims)

for i, (last_level, level) in enumerate(itertools.pairwise([None, *pooling_columns])):
if i == 0:
Expand Down Expand Up @@ -359,13 +368,17 @@ def make_unpooled_hierarchy(
beta = Prior(f"{name}_mu", **prior_kwargs, dims=dims)

for i, (last_level, level) in enumerate(itertools.pairwise([None, *levels])):
sigma = make_sigma(f"{name}_{level}_sigma", sigma_dist, sigma_kwargs, dims)
if i == 0:
sigma_dims = dims
else:
sigma_dims = [*dims, last_level] if dims is not None else [last_level]
beta_dims = [*dims, level] if dims is not None else [level]

sigma = make_sigma(f"{name}_{level}_effect", sigma_dist, sigma_kwargs, sigma_dims)

prior_kwargs["mu"] = beta[..., idx_maps[i]]
scale_name = "b" if prior == "Laplace" else "sigma"
prior_kwargs[scale_name] = sigma

beta_dims = [*dims, level] if dims is not None else [level]
prior_kwargs[scale_name] = sigma[..., idx_maps[i]]

beta = Prior(f"{name}_{level}_effect", **prior_kwargs, dims=beta_dims)

Expand Down
46 changes: 42 additions & 4 deletions tests/model/modular/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import pymc as pm
import pytest

from model.modular.utilities import encode_categoricals
from model.modular.utilities import at_least_list, encode_categoricals

from pymc_experimental.model.modular.components import Intercept, PoolingType
from pymc_experimental.model.modular.components import Intercept, PoolingType, Regression


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -51,5 +51,43 @@ def test_intercept(pooling: PoolingType, prior, model):
assert np.unique(x).shape[0] == 1


def test_regression():
pass
@pytest.mark.parametrize("pooling", ["partial", "none", "complete"], ids=str)
@pytest.mark.parametrize("prior", ["Normal", "Laplace", "StudentT"], ids=str)
@pytest.mark.parametrize(
    "feature_columns", ["income", ["age", "income"]], ids=["single", "multiple"]
)
def test_regression(pooling: PoolingType, prior, feature_columns, model):
    """Build a Regression component under each pooling/prior/column combination
    and check the variables and coords it registers, plus the beta shape.

    ``model`` is a session-scoped fixture defined elsewhere in this file;
    presumably it carries "obs_idx" and "city" coords — confirm against the
    fixture definition.
    """
    # name=None exercises the default-name path: Regression is expected to
    # fall back to f"Regression({feature_columns})" (asserted below).
    regression = Regression(
        name=None,
        feature_columns=feature_columns,
        prior=prior,
        pooling=pooling,
        pooling_columns="city",
    )

    # Copy so each parametrized case builds into a fresh model, leaving the
    # shared fixture untouched.
    temp_model = model.copy()
    xb = regression.build(temp_model)
    # build() should register a per-component feature dimension on the model.
    assert f"Regression({feature_columns})_features" in temp_model.coords.keys()

    if pooling != "complete":
        # Partial/no pooling creates per-city effect and scale variables.
        assert f"Regression({feature_columns})_city_effect" in temp_model.named_vars
        assert f"Regression({feature_columns})_city_effect_sigma" in temp_model.named_vars

        if pooling == "partial":
            # Partial pooling is non-centered: an offset variable with dims
            # must be registered as well.
            assert (
                f"Regression({feature_columns})_city_effect_offset" in temp_model.named_vars_to_dims
            )
    else:
        # Complete pooling collapses to a single flat beta named after the component.
        assert f"Regression({feature_columns})" in temp_model.named_vars

    xb_val = xb.eval()

    # NOTE(review): assumes build() returns (X * beta.T).sum(-1) (or X @ beta),
    # so the sum/matmul node's first input is the elementwise-mul/dot whose
    # inputs are X and beta — brittle if the graph structure changes.
    X, beta = xb.owner.inputs[0].owner.inputs
    beta_val = beta.eval()
    n_features = len(at_least_list(feature_columns))

    if pooling != "complete":
        # One row of output per observation; one distinct coefficient per
        # (city, feature) pair when effects vary by city.
        assert xb_val.shape[0] == len(model.coords["obs_idx"])
        assert np.unique(beta_val).shape[0] == len(model.coords["city"]) * n_features
    else:
        # Fully pooled: a single coefficient per feature.
        assert np.unique(beta_val).shape[0] == n_features

0 comments on commit 8a630ce

Please sign in to comment.