Update pickle to dill, fix various docs links and tests, update roadmap (#311)

* update pickle to dill for saving and update notebook links
* fix tf kernels test, update roadmap, fix links in docs
* update pytorch version
* remove print
* fix indent notebook
SeldonIO · Aug 16, 2021 · 65ed410 · 65ed410
1 parent e965165
commit 65ed410
Show file tree

Hide file tree

Showing 10 changed files with 39 additions and 37 deletions.
diff --git a/alibi_detect/utils/pytorch/tests/test_kernels_pt.py b/alibi_detect/utils/pytorch/tests/test_kernels_pt.py
@@ -80,4 +80,4 @@ def test_deep_kernel(deep_kernel_params):
     k_xx = kernel(x, x).detach().numpy()
     assert k_xy.shape == n_instances and k_xx.shape == (xshape[0], xshape[0])
     assert (np.diag(k_xx) > 0.).all()
-    np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=6)
+    np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=5)
diff --git a/alibi_detect/utils/saving.py b/alibi_detect/utils/saving.py
@@ -1,9 +1,9 @@
 # type: ignore
 # TODO: need to rewrite utilities using isinstance or @singledispatch for type checking to work properly
+import dill
 from functools import partial
 import logging
 import os
-import pickle
 import tensorflow as tf
 from tensorflow.keras.layers import Input, InputLayer
 from tensorflow_probability.python.distributions.distribution import Distribution
@@ -95,8 +95,8 @@ def save_detector(detector: Data, filepath: str) -> None:
         os.mkdir(filepath)
 
     # save metadata
-    with open(os.path.join(filepath, 'meta.pickle'), 'wb') as f:
-        pickle.dump(detector.meta, f)
+    with open(os.path.join(filepath, 'meta.dill'), 'wb') as f:
+        dill.dump(detector.meta, f)
 
     # save outlier detector specific parameters
     if detector_name == 'OutlierAE':
@@ -134,8 +134,8 @@ def save_detector(detector: Data, filepath: str) -> None:
     elif detector_name == 'LLR':
         state_dict = state_llr(detector)
 
-    with open(os.path.join(filepath, detector_name + '.pickle'), 'wb') as f:
-        pickle.dump(state_dict, f)
+    with open(os.path.join(filepath, detector_name + '.dill'), 'wb') as f:
+        dill.dump(state_dict, f)
 
     # save outlier detector specific TensorFlow models
     if detector_name == 'OutlierAE':
@@ -197,16 +197,16 @@ def save_embedding(embed: tf.keras.Model,
     if not os.path.isdir(model_dir):
         os.mkdir(model_dir)
     embed.save_pretrained(model_dir)
-    with open(os.path.join(filepath, save_dir, model_name + '.pickle'), 'wb') as f:
-        pickle.dump(embed_args, f)
+    with open(os.path.join(filepath, save_dir, model_name + '.dill'), 'wb') as f:
+        dill.dump(embed_args, f)
 
 
 def preprocess_step_drift(cd: Union[ChiSquareDrift, ClassifierDriftTF, KSDrift, MMDDriftTF, TabularDrift]) \
         -> Tuple[
             Optional[Callable], Dict, Optional[Union[tf.keras.Model, tf.keras.Sequential]],
             Optional[TransformerEmbedding], Dict, Optional[Callable], bool
         ]:
-    # note: need to be able to pickle tokenizers other than transformers
+    # note: need to be able to dill tokenizers other than transformers
     preprocess_fn, preprocess_kwargs = None, {}
     model, embed, embed_args, tokenizer, load_emb = None, None, {}, None, False
     if isinstance(cd.preprocess_fn, partial):
@@ -964,7 +964,7 @@ def load_detector(filepath: str, **kwargs) -> Data:
         raise ValueError('{} does not exist.'.format(filepath))
 
     # load metadata
-    meta_dict = pickle.load(open(os.path.join(filepath, 'meta.pickle'), 'rb'))
+    meta_dict = dill.load(open(os.path.join(filepath, 'meta.dill'), 'rb'))
 
     if 'backend' in list(meta_dict.keys()) and meta_dict['backend'] == 'pytorch':
         raise NotImplementedError('Detectors with PyTorch backend are not yet supported.')
@@ -974,7 +974,7 @@ def load_detector(filepath: str, **kwargs) -> Data:
         raise ValueError('{} is not supported by `load_detector`.'.format(detector_name))
 
     # load outlier detector specific parameters
-    state_dict = pickle.load(open(os.path.join(filepath, detector_name + '.pickle'), 'rb'))
+    state_dict = dill.load(open(os.path.join(filepath, detector_name + '.dill'), 'rb'))
 
     # initialize outlier detector
     if detector_name == 'OutlierAE':
@@ -1456,7 +1456,7 @@ def load_text_embed(filepath: str, load_dir: str = 'model') \
         -> Tuple[TransformerEmbedding, Callable]:
     model_dir = os.path.join(filepath, load_dir)
     tokenizer = AutoTokenizer.from_pretrained(model_dir)
-    args = pickle.load(open(os.path.join(model_dir, 'embedding.pickle'), 'rb'))
+    args = dill.load(open(os.path.join(model_dir, 'embedding.dill'), 'rb'))
     emb = TransformerEmbedding(
         model_dir, embedding_type=args['embedding_type'], layers=args['layers']
     )

diff --git a/alibi_detect/utils/tensorflow/tests/test_kernels_tf.py b/alibi_detect/utils/tensorflow/tests/test_kernels_tf.py
@@ -79,4 +79,4 @@ def test_deep_kernel(deep_kernel_params):
     k_xx = kernel(x, x).numpy()
     assert k_xy.shape == n_instances and k_xx.shape == (xshape[0], xshape[0])
     assert (np.diag(k_xx) > 0.).all()
-    np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=6)
+    np.testing.assert_almost_equal(k_xy, np.transpose(k_yx), decimal=5)
diff --git a/doc/source/overview/getting_started.md b/doc/source/overview/getting_started.md
@@ -100,7 +100,7 @@ And **data drift**:
 
 * [Least-Squares Density Difference Drift Detector](../methods/lsdddrift.ipynb)
 
-* [Maximum Mean Discrepancy Drift Detector](../methods/mmddrift.ipynb)
+* [Maximum Mean Discrepancy (MMD) Drift Detector](../methods/mmddrift.ipynb)
 
 * [Learned Kernel MMD Drift Detector](../methods/learnedkerneldrift.ipynb)
 
@@ -110,13 +110,13 @@ And **data drift**:
 
 * [Classifier Drift Detector](../methods/classifierdrift.ipynb)
 
-* [Spot-the-diff Drift Detector](../methods/classifierdrift.ipynb)
+* [Spot-the-diff Drift Detector](../methods/spotthediffdrift.ipynb)
 
 * [Classifier and Regressor Uncertainty Drift Detectors](../methods/modeluncdrift.ipynb)
 
 * [Online Maximum Mean Discrepancy Drift Detector](../methods/onlinemmddrift.ipynb)
 
-* [Online Least_Squares Density Difference Drift Detector](../methods/onlinelsdddrift.ipynb)
+* [Online Least-Squares Density Difference Drift Detector](../methods/onlinelsdddrift.ipynb)
 
 
 ## Basic Usage

diff --git a/doc/source/overview/roadmap.md b/doc/source/overview/roadmap.md
@@ -12,14 +12,14 @@ The algorithms will cover the following data types:
 * **Images**
 * **Time series**, both univariate and multivariate.
 * **Text**
+* **Graphs**
 
 It will also be possible to combine different algorithms in ensemble detectors.
 
-The library **currently** covers both online and offline outlier detection algorithms for 
-tabular data, images and time series as well as offline adversarial detectors for 
-tabular data and images. Current drift detection capabilities cover mixed type tabular data, text and images.
+The library **currently** covers both online and offline **outlier** detection algorithms for 
+tabular data, images and time series as well as offline **adversarial** detectors for 
+tabular data and images. Current **drift** detection capabilities cover almost any data modality such as mixed type tabular data, 
+text, images or graphs both in the online and offline setting.
 
-The **near term** focus will be on adding online and text drift detectors, extending the PyTorch support, and adding outlier detectors for text and mixed data types.
-
-In the **medium term**, we intend to leverage labels in a semi-supervised setting for the
-detectors and incorporate drift detection for time series.
+The **near term** focus will be on adding more use case specific drift detectors, extending save/load functionality for 
+PyTorch detectors, concept drift detection, and adding outlier detectors for text and mixed data types.
diff --git a/examples/cd_clf_cifar10.ipynb b/examples/cd_clf_cifar10.ipynb
@@ -585,7 +585,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.7.10"
   }
  },
  "nbformat": 4,

diff --git a/examples/cd_online_camelyon.ipynb b/examples/cd_online_camelyon.ipynb
@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This notebook demonstrates a typical workflow for applying online drift detectors to streams of image data. For those unfamiliar with how the online drift detectors operate in `alibi_detect` we recommend first checking out the more introductory example [`cd_online_wine.ipynb`](https://github.com/SeldonIO/alibi-detect/blob/master/examples/cd_online_wine.ipynb) where online drift detection is performed for the wine quality dataset."
+    "This notebook demonstrates a typical workflow for applying online drift detectors to streams of image data. For those unfamiliar with how the online drift detectors operate in `alibi_detect` we recommend first checking out the more introductory example [Online Drift Detection on the Wine Quality Dataset](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_online_wine.html) where online drift detection is performed for the wine quality dataset."
    ]
   },
   {
@@ -524,7 +524,7 @@
    "hash": "26d4efd8bf86ae199e0cff801fa58ff781ca69d267a2f4141eff4295422fc53d"
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -538,7 +538,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.5"
+   "version": "3.7.10"
   }
  },
  "nbformat": 4,

diff --git a/examples/cd_spot_the_diff_mnist_wine.ipynb b/examples/cd_spot_the_diff_mnist_wine.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# Interpretable drift detection with the spot-the-diff detector on MNIST and Wine-Quality datasets\n",
     "\n",
-    "Under the hood drift detectors leverage a function of the data that is expected to be large when drift has occured and small when it hasn't. In the [`cd_clf_cifar10.ipynb`](https://github.com/SeldonIO/alibi-detect/blob/master/examples/cd_clf_cifar10.ipynb) example notebook we note that we can **learn** a function satisfying this property by training a classifer to distinguish reference and test samples. However we now additionally note that if the classifier is specified in a certain way then when drift is detected we can inspect the weights of the classifier to shine light on exactly which features of the data were used to distinguish reference from test samples and therefore caused drift to be detected. \n",
+    "Under the hood drift detectors leverage a function of the data that is expected to be large when drift has occurred and small when it hasn't. In the [Learned drift detectors on CIFAR-10](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_clf_cifar10.html) example notebook we note that we can **learn** a function satisfying this property by training a classifier to distinguish reference and test samples. However we now additionally note that if the classifier is specified in a certain way then when drift is detected we can inspect the weights of the classifier to shine light on exactly which features of the data were used to distinguish reference from test samples and therefore caused drift to be detected. \n",
     "\n",
     "The `SpotTheDiffDrift` detector is designed to make this process straightforward. Like the `ClassifierDrift` detector, it uses a portion of the available data to train a classifier to discriminate between reference and test instances. Letting $\\hat{p}_T(x)$ represent the probability assigned by the classifier that the instance $x$ is from the test set rather than reference set, the difference here is that we use a classifier of the form $$\\text{logit}(\\hat{p}_T) = b_0 + b_1 k(x,w_1) + ... + b_Jk(x,w_J),$$ where $k(\\cdot,\\cdot)$ is a kernel specifying a notion of similarity between instances, $w_i$ are learnable *test locations* and $b_i$ are learnable regression coefficients.\n",
     "\n",
@@ -804,7 +804,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.7.10"
   }
  },
  "nbformat": 4,

diff --git a/examples/cd_text_imdb.ipynb b/examples/cd_text_imdb.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# Text drift detection on IMDB movie reviews\n",
     "\n",
-    "### Method\n",
+    "## Method\n",
     "\n",
     "We detect drift on text data using both the [Maximum Mean Discrepancy](https://docs.seldon.io/projects/alibi-detect/en/latest/methods/mmddrift.html) and [Kolmogorov-Smirnov (K-S)](https://docs.seldon.io/projects/alibi-detect/en/latest/methods/ksdrift.html) detectors. In this example notebook we will focus on detecting covariate shift $\\Delta p(x)$ as detecting predicted label distribution drift does not differ from other modalities (check [K-S](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_ks_cifar10.html#BBSDs) and [MMD](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_mmd_cifar10.html#BBSDs) drift on CIFAR-10).\n",
     "\n",
@@ -16,12 +16,12 @@
     "\n",
     "The library contains functionality to leverage pre-trained embeddings from [HuggingFace's transformer package](https://github.com/huggingface/transformers) but also allows you to easily use your own embeddings of choice. Both options are illustrated with examples in this notebook.\n",
     "\n",
-    "### Backend\n",
+    "## Backend\n",
     "\n",
     "The method works with both the **PyTorch** and **TensorFlow** frameworks for the statistical tests and preprocessing steps. Alibi Detect does not, however, install PyTorch for you. \n",
     "Check the [PyTorch docs](https://pytorch.org/) for how to do this.\n",
     "\n",
-    "### Dataset\n",
+    "## Dataset\n",
     "\n",
     "Binary sentiment classification [dataset](https://ai.stanford.edu/~amaas/data/sentiment/) containing $25,000$ movie reviews for training and $25,000$ for testing. Install the `nlp` library to fetch the dataset:\n",
     "\n",
@@ -47,7 +47,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Load tokenizer"
+    "### Load tokenizer"
    ]
   },
   {
@@ -66,7 +66,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Load data"
+    "### Load data"
    ]
   },
   {
@@ -437,6 +437,7 @@
    "metadata": {},
    "source": [
     "## K-S detector\n",
+    "\n",
     "### Initialize\n",
     "\n",
     "We proceed to initialize the drift detector. From here on the detector works the same as for other modalities such as images. Please check the [images](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_ks_cifar10.html) example or the [K-S detector documentation](https://docs.seldon.io/projects/alibi-detect/en/latest/methods/ksdrift.html) for more information about each of the possible parameters."
@@ -895,7 +896,7 @@
    "source": [
     "## MMD PyTorch detector\n",
     "\n",
-    "## Initialize\n",
+    "### Initialize\n",
     "\n",
     "We can run the same detector with *PyTorch* backend for both the preprocessing step and MMD implementation:"
    ]
@@ -1609,7 +1610,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.10"
   }
  },
  "nbformat": 4,

diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@ def readme():
 
 extras_require = {"examples": ["seaborn>=0.9.0", "nlp>=0.3.0"],
                   "prophet": ["fbprophet>=0.5, <0.7", "holidays==0.9.11", "pystan<3.0"],
-                  "torch": ["torch>=1.0"]}
+                  "torch": ["torch>=1.7.0"]}
 
 setup(
     name="alibi-detect",
@@ -39,6 +39,7 @@ def readme():
         "tensorflow>=2.0.0, <2.6.0",
         "tensorflow_probability>=0.8.0, <0.13.0",
         "transformers>=4.0.0, <5.0.0",
+        "dill>=0.3.0, <0.4.0",
         "tqdm>=4.28.1, <5.0.0"
     ],
     extras_require=extras_require,