Skip to content

Commit

Permalink
add myst file
Browse files Browse the repository at this point in the history
  • Loading branch information
cluhmann committed Dec 7, 2023
1 parent f0c54eb commit 436644d
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 86 deletions.
127 changes: 57 additions & 70 deletions examples/generalized_linear_models/GLM-out-of-sample-predictions.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ kernelspec:
(GLM-out-of-sample-predictions)=
# Out-Of-Sample Predictions

:::{post} December, 2022
:::{post} December, 2023
:tags: generalized linear model, logistic regression, out of sample predictions, patsy
:category: beginner
:::
Expand All @@ -23,13 +23,11 @@ import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import pymc as pm
import seaborn as sns
from scipy.special import expit as inverse_logit
from sklearn.metrics import RocCurveDisplay, accuracy_score, auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import RocCurveDisplay, auc, roc_curve
```

```{code-cell} ipython3
Expand All @@ -55,7 +53,7 @@ beta_x2 = -1
beta_interaction = 2
z = intercept + beta_x1 * x1 + beta_x2 * x2 + beta_interaction * x1 * x2
p = inverse_logit(z)
# note binimial with n=1 is equal to a bernoulli
# note binomial with n=1 is equal to a Bernoulli
y = rng.binomial(n=1, p=p, size=n)
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))
df.head()
Expand All @@ -81,26 +79,33 @@ ax.set(title="Sample Data", xlim=(-9, 9), ylim=(-9, 9));
## Prepare Data for Modeling

```{code-cell} ipython3
y, x = patsy.dmatrices("y ~ x1 * x2", data=df)
y = np.asarray(y).flatten()
labels = x.design_info.column_names
x = np.asarray(x)
labels = ["Intercept", "x1", "x2", "x1:x2"]
df["Intercept"] = np.ones(len(df))
df["x1:x2"] = df["x1"] * df["x2"]
# reorder columns to be in the same order as labels
df = df[labels]
x = df.to_numpy()
```

Now we do a train-test split.

```{code-cell} ipython3
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)
indices = rng.permutation(x.shape[0])
train_prop = 0.7
train_size = int(train_prop * x.shape[0])
training_idx, test_idx = indices[:train_size], indices[train_size:]
x_train, x_test = x[training_idx, :], x[test_idx, :]
y_train, y_test = y[training_idx], y[test_idx]
```

## Define and Fit the Model

We now specify the model in PyMC.

```{code-cell} ipython3
COORDS = {"coeffs": labels}
coords = {"coeffs": labels}
with pm.Model(coords=COORDS) as model:
with pm.Model(coords=coords) as model:
# data containers
X = pm.MutableData("X", x_train)
y = pm.MutableData("y", y_train)
Expand Down Expand Up @@ -152,15 +157,15 @@ with model:
```{code-cell} ipython3
# Compute the point prediction by taking the mean and defining the category via a threshold.
p_test_pred = idata.posterior_predictive["obs"].mean(dim=["chain", "draw"])
y_test_pred = (p_test_pred >= 0.5).astype("int")
y_test_pred = (p_test_pred >= 0.5).astype("int").to_numpy()
```

## Evaluate Model

First let us compute the accuracy on the test set.

```{code-cell} ipython3
print(f"accuracy = {accuracy_score(y_true=y_test, y_pred=y_test_pred): 0.3f}")
print(f"accuracy = {np.mean(y_test==y_test_pred): 0.3f}")
```

Next, we plot the [roc curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) and compute the [auc](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve).
Expand Down Expand Up @@ -218,8 +223,13 @@ x1_grid, x2_grid, x_grid = make_grid()
with model:
# Create features on the grid.
x_grid_ext = patsy.dmatrix(formula_like="x1 * x2", data=dict(x1=x_grid[:, 0], x2=x_grid[:, 1]))
x_grid_ext = np.asarray(x_grid_ext)
x_grid_ext = np.hstack(
(
np.ones((x_grid.shape[0], 1)),
x_grid,
(x_grid[:, 0] * x_grid[:, 1]).reshape(-1, 1),
)
)
# set the observed variables
pm.set_data({"X": x_grid_ext})
# calculate pushforward values of `p`
Expand Down Expand Up @@ -278,12 +288,17 @@ Note that we have computed the model decision boundary by using the mean of the
- [Bayesian Analysis with Python (Second edition) - Chapter 4](https://github.com/aloctavodia/BAP/blob/master/code/Chp4/04_Generalizing_linear_models.ipynb)
- [Statistical Rethinking](https://xcelab.net/rm/statistical-rethinking/)

+++



+++

## Authors
- Created by [Juan Orduz](https://github.com/juanitorduz).
- Updated by [Benjamin T. Vincent](https://github.com/drbenvincent) to PyMC v4 in June 2022
- Re-executed by [Benjamin T. Vincent](https://github.com/drbenvincent) with PyMC v5 in December 2022
- Updated by [Christian Luhmann](https://github.com/cluhmann) with PyMC v5 in December 2023

+++

Expand Down

0 comments on commit 436644d

Please sign in to comment.