Skip to content

Commit

Permalink
Update pickle to dill, fix various docs links and tests, update roadm…
Browse files Browse the repository at this point in the history
…ap (#311)

* update pickle to dill for saving and update notebook links

* fix tf kernels test, update roadmap, fix links in docs

* update pytorch version

* remove print

* fix indent notebook
  • Loading branch information
arnaudvl authored Aug 16, 2021
1 parent e965165 commit 65ed410
Show file tree
Hide file tree
Showing 10 changed files with 39 additions and 37 deletions.
2 changes: 1 addition & 1 deletion alibi_detect/utils/pytorch/tests/test_kernels_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,4 @@ def test_deep_kernel(deep_kernel_params):
k_xx = kernel(x, x).detach().numpy()
assert k_xy.shape == n_instances and k_xx.shape == (xshape[0], xshape[0])
assert (np.diag(k_xx) > 0.).all()
np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=6)
np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=5)
22 changes: 11 additions & 11 deletions alibi_detect/utils/saving.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# type: ignore
# TODO: need to rewrite utilities using isinstance or @singledispatch for type checking to work properly
import dill
from functools import partial
import logging
import os
import pickle
import tensorflow as tf
from tensorflow.keras.layers import Input, InputLayer
from tensorflow_probability.python.distributions.distribution import Distribution
Expand Down Expand Up @@ -95,8 +95,8 @@ def save_detector(detector: Data, filepath: str) -> None:
os.mkdir(filepath)

# save metadata
with open(os.path.join(filepath, 'meta.pickle'), 'wb') as f:
pickle.dump(detector.meta, f)
with open(os.path.join(filepath, 'meta.dill'), 'wb') as f:
dill.dump(detector.meta, f)

# save outlier detector specific parameters
if detector_name == 'OutlierAE':
Expand Down Expand Up @@ -134,8 +134,8 @@ def save_detector(detector: Data, filepath: str) -> None:
elif detector_name == 'LLR':
state_dict = state_llr(detector)

with open(os.path.join(filepath, detector_name + '.pickle'), 'wb') as f:
pickle.dump(state_dict, f)
with open(os.path.join(filepath, detector_name + '.dill'), 'wb') as f:
dill.dump(state_dict, f)

# save outlier detector specific TensorFlow models
if detector_name == 'OutlierAE':
Expand Down Expand Up @@ -197,16 +197,16 @@ def save_embedding(embed: tf.keras.Model,
if not os.path.isdir(model_dir):
os.mkdir(model_dir)
embed.save_pretrained(model_dir)
with open(os.path.join(filepath, save_dir, model_name + '.pickle'), 'wb') as f:
pickle.dump(embed_args, f)
with open(os.path.join(filepath, save_dir, model_name + '.dill'), 'wb') as f:
dill.dump(embed_args, f)


def preprocess_step_drift(cd: Union[ChiSquareDrift, ClassifierDriftTF, KSDrift, MMDDriftTF, TabularDrift]) \
-> Tuple[
Optional[Callable], Dict, Optional[Union[tf.keras.Model, tf.keras.Sequential]],
Optional[TransformerEmbedding], Dict, Optional[Callable], bool
]:
# note: need to be able to pickle tokenizers other than transformers
# note: need to be able to dill tokenizers other than transformers
preprocess_fn, preprocess_kwargs = None, {}
model, embed, embed_args, tokenizer, load_emb = None, None, {}, None, False
if isinstance(cd.preprocess_fn, partial):
Expand Down Expand Up @@ -964,7 +964,7 @@ def load_detector(filepath: str, **kwargs) -> Data:
raise ValueError('{} does not exist.'.format(filepath))

# load metadata
meta_dict = pickle.load(open(os.path.join(filepath, 'meta.pickle'), 'rb'))
meta_dict = dill.load(open(os.path.join(filepath, 'meta.dill'), 'rb'))

if 'backend' in list(meta_dict.keys()) and meta_dict['backend'] == 'pytorch':
raise NotImplementedError('Detectors with PyTorch backend are not yet supported.')
Expand All @@ -974,7 +974,7 @@ def load_detector(filepath: str, **kwargs) -> Data:
raise ValueError('{} is not supported by `load_detector`.'.format(detector_name))

# load outlier detector specific parameters
state_dict = pickle.load(open(os.path.join(filepath, detector_name + '.pickle'), 'rb'))
state_dict = dill.load(open(os.path.join(filepath, detector_name + '.dill'), 'rb'))

# initialize outlier detector
if detector_name == 'OutlierAE':
Expand Down Expand Up @@ -1456,7 +1456,7 @@ def load_text_embed(filepath: str, load_dir: str = 'model') \
-> Tuple[TransformerEmbedding, Callable]:
model_dir = os.path.join(filepath, load_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
args = pickle.load(open(os.path.join(model_dir, 'embedding.pickle'), 'rb'))
args = dill.load(open(os.path.join(model_dir, 'embedding.dill'), 'rb'))
emb = TransformerEmbedding(
model_dir, embedding_type=args['embedding_type'], layers=args['layers']
)
Expand Down
2 changes: 1 addition & 1 deletion alibi_detect/utils/tensorflow/tests/test_kernels_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,4 @@ def test_deep_kernel(deep_kernel_params):
k_xx = kernel(x, x).numpy()
assert k_xy.shape == n_instances and k_xx.shape == (xshape[0], xshape[0])
assert (np.diag(k_xx) > 0.).all()
np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=6)
np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=5)
6 changes: 3 additions & 3 deletions doc/source/overview/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ And **data drift**:

* [Least-Squares Density Difference Drift Detector](../methods/lsdddrift.ipynb)

* [Maximum Mean Discrepancy Drift Detector](../methods/mmddrift.ipynb)
* [Maximum Mean Discrepancy (MMD) Drift Detector](../methods/mmddrift.ipynb)

* [Learned Kernel MMD Drift Detector](../methods/learnedkerneldrift.ipynb)

Expand All @@ -110,13 +110,13 @@ And **data drift**:

* [Classifier Drift Detector](../methods/classifierdrift.ipynb)

* [Spot-the-diff Drift Detector](../methods/classifierdrift.ipynb)
* [Spot-the-diff Drift Detector](../methods/spotthediffdrift.ipynb)

* [Classifier and Regressor Uncertainty Drift Detectors](../methods/modeluncdrift.ipynb)

* [Online Maximum Mean Discrepancy Drift Detector](../methods/onlinemmddrift.ipynb)

* [Online Least_Squares Density Difference Drift Detector](../methods/onlinelsdddrift.ipynb)
* [Online Least-Squares Density Difference Drift Detector](../methods/onlinelsdddrift.ipynb)


## Basic Usage
Expand Down
14 changes: 7 additions & 7 deletions doc/source/overview/roadmap.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ The algorithms will cover the following data types:
* **Images**
* **Time series**, both univariate and multivariate.
* **Text**
* **Graphs**

It will also be possible to combine different algorithms in ensemble detectors.

The library **currently** covers both online and offline outlier detection algorithms for
tabular data, images and time series as well as offline adversarial detectors for
tabular data and images. Current drift detection capabilities cover mixed type tabular data, text and images.
The library **currently** covers both online and offline **outlier** detection algorithms for
tabular data, images and time series as well as offline **adversarial** detectors for
tabular data and images. Current **drift** detection capabilities cover almost any data modality such as mixed type tabular data,
text, images or graphs both in the online and offline setting.

The **near term** focus will be on adding online and text drift detectors, extending the PyTorch support, and adding outlier detectors for text and mixed data types.

In the **medium term**, we intend to leverage labels in a semi-supervised setting for the
detectors and incorporate drift detection for time series.
The **near term** focus will be on adding more use-case-specific drift detectors, extending save/load functionality for
PyTorch detectors, adding concept drift detection, and adding outlier detectors for text and mixed data types.
2 changes: 1 addition & 1 deletion examples/cd_clf_cifar10.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.7.10"
}
},
"nbformat": 4,
Expand Down
6 changes: 3 additions & 3 deletions examples/cd_online_camelyon.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook demonstrates a typical workflow for applying online drift detectors to streams of image data. For those unfamiliar with how the online drift detectors operate in `alibi_detect` we recommend first checking out the more introductory example [`cd_online_wine.ipynb`](https://github.com/SeldonIO/alibi-detect/blob/master/examples/cd_online_wine.ipynb) where online drift detection is performed for the wine quality dataset."
"This notebook demonstrates a typical workflow for applying online drift detectors to streams of image data. For those unfamiliar with how the online drift detectors operate in `alibi_detect` we recommend first checking out the more introductory example [Online Drift Detection on the Wine Quality Dataset](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_online_wine.html) where online drift detection is performed for the wine quality dataset."
]
},
{
Expand Down Expand Up @@ -524,7 +524,7 @@
"hash": "26d4efd8bf86ae199e0cff801fa58ff781ca69d267a2f4141eff4295422fc53d"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -538,7 +538,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
"version": "3.7.10"
}
},
"nbformat": 4,
Expand Down
4 changes: 2 additions & 2 deletions examples/cd_spot_the_diff_mnist_wine.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"source": [
"# Interpretable drift detection with the spot-the-diff detector on MNIST and Wine-Quality datasets\n",
"\n",
"Under the hood drift detectors leverage a function of the data that is expected to be large when drift has occured and small when it hasn't. In the [`cd_clf_cifar10.ipynb`](https://github.com/SeldonIO/alibi-detect/blob/master/examples/cd_clf_cifar10.ipynb) example notebook we note that we can **learn** a function satisfying this property by training a classifer to distinguish reference and test samples. However we now additionally note that if the classifier is specified in a certain way then when drift is detected we can inspect the weights of the classifier to shine light on exactly which features of the data were used to distinguish reference from test samples and therefore caused drift to be detected. \n",
"Under the hood drift detectors leverage a function of the data that is expected to be large when drift has occurred and small when it hasn't. In the [Learned drift detectors on CIFAR-10](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_clf_cifar10.html) example notebook we note that we can **learn** a function satisfying this property by training a classifier to distinguish reference and test samples. However we now additionally note that if the classifier is specified in a certain way then when drift is detected we can inspect the weights of the classifier to shine light on exactly which features of the data were used to distinguish reference from test samples and therefore caused drift to be detected. \n",
"\n",
"The `SpotTheDiffDrift` detector is designed to make this process straightforward. Like the `ClassifierDrift` detector, it uses a portion of the available data to train a classifier to discriminate between reference and test instances. Letting $\\hat{p}_T(x)$ represent the probability assigned by the classifier that the instance $x$ is from the test set rather than reference set, the difference here is that we use a classifier of the form $$\\text{logit}(\\hat{p}_T) = b_0 + b_1 k(x,w_1) + ... + b_Jk(x,w_J),$$ where $k(\\cdot,\\cdot)$ is a kernel specifying a notion of similarity between instances, $w_i$ are learnable *test locations* and $b_i$ are learnable regression coefficients.\n",
"\n",
Expand Down Expand Up @@ -804,7 +804,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.7.10"
}
},
"nbformat": 4,
Expand Down
15 changes: 8 additions & 7 deletions examples/cd_text_imdb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"source": [
"# Text drift detection on IMDB movie reviews\n",
"\n",
"### Method\n",
"## Method\n",
"\n",
"We detect drift on text data using both the [Maximum Mean Discrepancy](https://docs.seldon.io/projects/alibi-detect/en/latest/methods/mmddrift.html) and [Kolmogorov-Smirnov (K-S)](https://docs.seldon.io/projects/alibi-detect/en/latest/methods/ksdrift.html) detectors. In this example notebook we will focus on detecting covariate shift $\\Delta p(x)$ as detecting predicted label distribution drift does not differ from other modalities (check [K-S](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_ks_cifar10.html#BBSDs) and [MMD](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_mmd_cifar10.html#BBSDs) drift on CIFAR-10).\n",
"\n",
Expand All @@ -16,12 +16,12 @@
"\n",
"The library contains functionality to leverage pre-trained embeddings from [HuggingFace's transformer package](https://github.com/huggingface/transformers) but also allows you to easily use your own embeddings of choice. Both options are illustrated with examples in this notebook.\n",
"\n",
"### Backend\n",
"## Backend\n",
"\n",
"The method works with both the **PyTorch** and **TensorFlow** frameworks for the statistical tests and preprocessing steps. However, Alibi Detect does not install PyTorch for you. \n",
"Check the [PyTorch docs](https://pytorch.org/) for how to do this.\n",
"\n",
"### Dataset\n",
"## Dataset\n",
"\n",
"Binary sentiment classification [dataset](https://ai.stanford.edu/~amaas/data/sentiment/) containing $25,000$ movie reviews for training and $25,000$ for testing. Install the `nlp` library to fetch the dataset:\n",
"\n",
Expand All @@ -47,7 +47,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load tokenizer"
"### Load tokenizer"
]
},
{
Expand All @@ -66,7 +66,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
"### Load data"
]
},
{
Expand Down Expand Up @@ -437,6 +437,7 @@
"metadata": {},
"source": [
"## K-S detector\n",
"\n",
"### Initialize\n",
"\n",
"We proceed to initialize the drift detector. From here on the detector works the same as for other modalities such as images. Please check the [images](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_ks_cifar10.html) example or the [K-S detector documentation](https://docs.seldon.io/projects/alibi-detect/en/latest/methods/ksdrift.html) for more information about each of the possible parameters."
Expand Down Expand Up @@ -895,7 +896,7 @@
"source": [
"## MMD PyTorch detector\n",
"\n",
"## Initialize\n",
"### Initialize\n",
"\n",
"We can run the same detector with *PyTorch* backend for both the preprocessing step and MMD implementation:"
]
Expand Down Expand Up @@ -1609,7 +1610,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.7.10"
}
},
"nbformat": 4,
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def readme():

extras_require = {"examples": ["seaborn>=0.9.0", "nlp>=0.3.0"],
"prophet": ["fbprophet>=0.5, <0.7", "holidays==0.9.11", "pystan<3.0"],
"torch": ["torch>=1.0"]}
"torch": ["torch>=1.7.0"]}

setup(
name="alibi-detect",
Expand Down Expand Up @@ -39,6 +39,7 @@ def readme():
"tensorflow>=2.0.0, <2.6.0",
"tensorflow_probability>=0.8.0, <0.13.0",
"transformers>=4.0.0, <5.0.0",
"dill>=0.3.0, <0.4.0",
"tqdm>=4.28.1, <5.0.0"
],
extras_require=extras_require,
Expand Down

0 comments on commit 65ed410

Please sign in to comment.