LR intent classifier (#133)
add sparse lr/sklearn intent classifier
ka-bu authored Jun 28, 2021
1 parent e628388 commit 6ec0a71
Showing 13 changed files with 621 additions and 315 deletions.
60 changes: 60 additions & 0 deletions docs/docs/classifier/sparselr.md
@@ -0,0 +1,60 @@
# SparseLogisticRegressionIntentClassifier

This intent classifier is based on the logistic regression classifier from
[sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
This classifier only looks at sparse features extracted from the Rasa NLU
feature pipeline and is a faster alternative to neural models like
[DIET](https://rasa.com/docs/rasa/components#dietclassifier-2). This model
requires at least one sparse featurizer in your pipeline. If your config
only has dense features, it will throw an exception.
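
For instance, a minimal sketch of a pipeline that satisfies this requirement
might look like the following (a more complete example follows in the Base
Usage section below):

```yaml
language: en

pipeline:
- name: WhitespaceTokenizer
- name: CountVectorsFeaturizer
- name: rasa_nlu_examples.classifiers.SparseLogisticRegressionIntentClassifier
```

Here the `CountVectorsFeaturizer` supplies the sparse features that the
classifier trains on.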

## Configurable Variables

This classifier supports the same parameters as those listed in the [sklearn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). The only differences are:

- there is no `warm_start` option
- the default `class_weight` is "balanced"
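
Any of these parameters can be set in the component config; they are passed
through to sklearn's `LogisticRegression`. As a quick sketch, the snippet
below overrides the regularization strength `C` and the iteration limit
`max_iter` while keeping all other defaults:

```yaml
language: en

pipeline:
- name: WhitespaceTokenizer
- name: CountVectorsFeaturizer
- name: rasa_nlu_examples.classifiers.SparseLogisticRegressionIntentClassifier
  C: 0.5
  max_iter: 1000
```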

## Base Usage

The configuration file below demonstrates how you might use this component.
In this example we are extracting sparse features with two
CountVectorsFeaturizer instances, the first of which produces sparse
bag-of-words features and the second of which produces sparse
bag-of-character-ngram features.

Note that in the following example, explicitly setting the `class_weight`
parameter to `None` does have an effect, because our default value for this
parameter is "balanced".

```yaml
language: en

pipeline:
- name: WhitespaceTokenizer
- name: CountVectorsFeaturizer
- name: CountVectorsFeaturizer
  analyzer: char_wb
  min_ngram: 1
  max_ngram: 4
- name: rasa_nlu_examples.classifiers.SparseLogisticRegressionIntentClassifier
  class_weight: None
```
Unlike [DIET](https://rasa.com/docs/rasa/components#dietclassifier-2), this
classifier only predicts intents. If you also need entity extraction, you will
have to add a separate entity extractor to your config. Below is an example
where we have included the CRFEntityExtractor to extract entities.
```yaml
language: en

pipeline:
- name: WhitespaceTokenizer
- name: LexicalSyntacticFeaturizer
- name: CountVectorsFeaturizer
- name: CountVectorsFeaturizer
  analyzer: char_wb
  min_ngram: 1
  max_ngram: 4
- name: rasa_nlu_examples.classifiers.SparseLogisticRegressionIntentClassifier
- name: CRFEntityExtractor
```
1 change: 1 addition & 0 deletions docs/index.md
@@ -71,6 +71,7 @@ fairly computationally expensive, especially if you do not need to detect
entities. We provide some examples of alternative intent classifiers here.

**`rasa_nlu_examples.classifiers.SparseNaiveBayesIntentClassifier` [docs](docs/classifier/sparsenb.md)**
**`rasa_nlu_examples.classifiers.SparseLogisticRegressionIntentClassifier` [docs](docs/classifier/sparselr.md)**

## **Entity Extractors**

1 change: 1 addition & 0 deletions mkdocs.yml
@@ -26,6 +26,7 @@ nav:
- SemanticMapFeaturizer: docs/featurizer/semantic_map.md
- Intent Classifiers:
- SparseNaiveBayes: docs/classifier/sparsenb.md
- SparseLogisticRegression: docs/classifier/sparselr.md
- Entity Extractors:
- FlashText: docs/extractors/flashtext.md
- DateParser: docs/extractors/dateparser.md
8 changes: 7 additions & 1 deletion rasa_nlu_examples/classifiers/__init__.py
@@ -1,3 +1,9 @@
from .sparse_naive_bayes_intent_classifier import SparseNaiveBayesIntentClassifier
from .sparse_logistic_regression_intent_classifier import (
SparseLogisticRegressionIntentClassifier,
)

__all__ = ["SparseNaiveBayesIntentClassifier"]
__all__ = [
"SparseNaiveBayesIntentClassifier",
"SparseLogisticRegressionIntentClassifier",
]
42 changes: 42 additions & 0 deletions rasa_nlu_examples/classifiers/sparse_logistic_regression_intent_classifier.py
@@ -0,0 +1,42 @@
from typing import Any

import sklearn
from sklearn.linear_model import LogisticRegression

from rasa_nlu_examples.classifiers.sparse_sklearn_intent_classifier import (
SparseSklearnIntentClassifier,
)


class SparseLogisticRegressionIntentClassifier(SparseSklearnIntentClassifier):
r"""A logistic regression classifier using the sklearn framework with sparse features."""

defaults = {
# The following parameters and defaults are the same as the ones used by the
# current scikit-learn version (0.24.2). For some nice explanations on what
# these parameters and their defaults do, have a look at the scikit-learn docs:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
"C": 1.0,
"class_weight": "balanced",
"dual": False,
"fit_intercept": True,
"intercept_scaling": 1,
"l1_ratio": None,
"max_iter": 100,
"multi_class": "auto",
"n_jobs": None,
"penalty": "l2",
"random_state": None,
"solver": "lbfgs",
"tol": 0.0001,
"verbose": 0,
}

def create_sklearn_classifier(self, **kwargs: Any) -> sklearn.base.ClassifierMixin:
r"""Lazily imports the required sklearn classifier class and creates an
instance of the sklearn classifier using all the given keyword arguments.
:param **kwargs: see defaults dictionary
"""

return LogisticRegression(**kwargs)
253 changes: 13 additions & 240 deletions rasa_nlu_examples/classifiers/sparse_naive_bayes_intent_classifier.py
@@ -1,37 +1,15 @@
import logging
import os
import typing
import warnings
from typing import Any, Dict, List, Optional, Text, Tuple, Type
from typing import Any

import numpy as np
import scipy.sparse
import sklearn
from sklearn.naive_bayes import BernoulliNB

import rasa.shared.utils.io
import rasa.utils.io as io_utils
from rasa.shared.constants import DOCS_URL_TRAINING_DATA_NLU
from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
from rasa.nlu.featurizers.featurizer import SparseFeaturizer
from rasa.nlu.components import Component
from rasa.nlu.classifiers.classifier import IntentClassifier
from rasa.nlu.config import RasaNLUModelConfig
from rasa.shared.nlu.constants import TEXT
from rasa.nlu.model import Metadata
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.shared.nlu.training_data.message import Message
from rasa_nlu_examples.classifiers.sparse_sklearn_intent_classifier import (
SparseSklearnIntentClassifier,
)

logger = logging.getLogger(__name__)

if typing.TYPE_CHECKING:
import sklearn


class SparseNaiveBayesIntentClassifier(IntentClassifier):
"""A naive Bayes intent classifier using the sklearn framework with sparse features."""

@classmethod
def required_components(cls) -> List[Type[Component]]:
return [SparseFeaturizer]
class SparseNaiveBayesIntentClassifier(SparseSklearnIntentClassifier):
r"""A naive Bayes intent classifier using the sklearn framework with sparse features."""

defaults = {
# Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
@@ -47,215 +25,10 @@ def required_components(cls) -> List[Type[Component]]:
"class_prior": None,
}

def __init__(
self,
component_config: Optional[Dict[Text, Any]] = None,
clf: Optional["sklearn.naive_bayes.BernoulliNB"] = None,
le: Optional["sklearn.preprocessing.LabelEncoder"] = None,
) -> None:
"""Construct a new naive Bayes intent classifier using the sklearn framework."""
from sklearn.preprocessing import LabelEncoder

super().__init__(component_config)

if le is not None:
self.le = le
else:
self.le = LabelEncoder()
self.clf = clf

@classmethod
def required_packages(cls) -> List[Text]:
return ["sklearn"]

def transform_labels_str2num(self, labels: List[Text]) -> np.ndarray:
"""
Transforms a list of strings into numeric label representation.
:param labels: List of labels to convert to numeric representation
:returns: numpy array of numeric label ids.
"""

return self.le.fit_transform(labels)

def transform_labels_num2str(self, y: np.ndarray) -> np.ndarray:
"""
Transforms a numpy array of numeric label ids into a list of string label ids.
:param y: array of labels to convert to string representation
:returns: an ndarray of label id strings
"""

return self.le.inverse_transform(y)

def train(
self,
training_data: TrainingData,
config: Optional[RasaNLUModelConfig] = None,
**kwargs: Any,
) -> None:
"""Train the intent classifier on a data set."""

from sklearn.naive_bayes import BernoulliNB

alpha = self.component_config["alpha"]
binarize = self.component_config["binarize"]
fit_prior = self.component_config["fit_prior"]
class_prior = self.component_config["class_prior"]

self.clf = BernoulliNB(
alpha=alpha, binarize=binarize, fit_prior=fit_prior, class_prior=class_prior
)

X, y = self.prepare_data(training_data)

with warnings.catch_warnings():
# sklearn raises lots of
# "UndefinedMetricWarning: F - score is ill - defined"
# if there are few intent examples, this is needed to prevent it
warnings.simplefilter("ignore")
self.clf.fit(X, y)

def prepare_data(
self, training_data: TrainingData
) -> Tuple[scipy.sparse.spmatrix, np.ndarray]:
"""
Converts a rasa TrainingData object into a tuple of a sparse feature
matrix and a dense vector of labels.
"""

labels = [e.get("intent") for e in training_data.intent_examples]

if len(set(labels)) < 2:
rasa.shared.utils.io.raise_warning(
"Can not train an intent classifier as there are not "
"enough intents. Need at least 2 different intents. "
"Skipping training of intent classifier.",
docs=DOCS_URL_TRAINING_DATA_NLU,
)
return

y = self.transform_labels_str2num(labels)
X = scipy.sparse.vstack(
[
self._get_sentence_features(example)
for example in training_data.intent_examples
]
)

return X, y

@staticmethod
def _get_sentence_features(message: Message) -> scipy.sparse.spmatrix:
_, dense_sentence_features = message.get_dense_features(TEXT)
if dense_sentence_features is not None:
rasa.shared.utils.io.raise_warning(
"Dense features are being computed but not used in "
"the SparseNaiveBayesIntentClassifier."
)

_, sentence_features = message.get_sparse_features(TEXT)
if sentence_features is not None:
return sentence_features.features

raise ValueError(
"No sparse sentence features present. "
"Not able to train sklearn intent classifier."
)
def create_sklearn_classifier(self, **kwargs: Any) -> sklearn.base.ClassifierMixin:
r"""Lazily imports the required sklearn classifier class and creates an
instance of the sklearn classifier using all the given keyword arguments.
def process(self, message: Message, **kwargs: Any) -> None:
"""Return the most likely intent and its probability for a message."""

if not self.clf:
# component is either not trained or didn't
# receive enough training data
intent = None
intent_ranking = []
else:
X = self._get_sentence_features(message)
intent_ids, probabilities = self.predict(X)
intents = self.transform_labels_num2str(np.ravel(intent_ids))
# `predict` returns a matrix as it is supposed
# to work for multiple examples as well, hence we need to flatten
probabilities = probabilities.flatten()
if intents.size > 0 and probabilities.size > 0:
ranking = list(zip(list(intents), list(probabilities)))[
:LABEL_RANKING_LENGTH
]

intent = {"name": intents[0], "confidence": probabilities[0]}

intent_ranking = [
{"name": intent_name, "confidence": score}
for intent_name, score in ranking
]
else:
intent = {"name": None, "confidence": 0.0}
intent_ranking = []

message.set("intent", intent, add_to_output=True)
message.set("intent_ranking", intent_ranking, add_to_output=True)

def predict_prob(self, X: scipy.sparse.spmatrix) -> np.ndarray:
:param **kwargs: see defaults dictionary
"""
Given a bow vector of an input text, predict the intent label.
Return probabilities for all labels.
:param X: bow of input text
:return: vector of probabilities containing one entry for each label
"""

return self.clf.predict_proba(X)

def predict(self, X: scipy.sparse.spmatrix) -> Tuple[np.ndarray, np.ndarray]:
"""
Given a bow vector of an input text, predict the corresponding intent.
Return intents and their probabilities, in decreasing order of likelihood.
:param X: bow of input text
:return: tuple of first, intent labels and second,
intent probabilities.
"""

pred_result = self.predict_prob(X)
# sort the probabilities retrieving the indices of
# the elements in sorted order
sorted_indices = np.fliplr(np.argsort(pred_result, axis=1))
return sorted_indices, pred_result[:, sorted_indices]

def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
"""Persist this model into the passed directory."""

classifier_file_name = file_name + "_classifier.pkl"
encoder_file_name = file_name + "_encoder.pkl"
if self.clf and self.le:
io_utils.json_pickle(
os.path.join(model_dir, encoder_file_name), self.le.classes_
)
io_utils.json_pickle(
os.path.join(model_dir, classifier_file_name), self.clf
)
return {"classifier": classifier_file_name, "encoder": encoder_file_name}

@classmethod
def load(
cls,
meta: Dict[Text, Any],
model_dir: Optional[Text] = None,
model_metadata: Optional[Metadata] = None,
cached_component: Optional["SparseNaiveBayesIntentClassifier"] = None,
**kwargs: Any,
) -> "SparseNaiveBayesIntentClassifier":
from sklearn.preprocessing import LabelEncoder

classifier_file = os.path.join(model_dir, meta.get("classifier"))
encoder_file = os.path.join(model_dir, meta.get("encoder"))

if os.path.exists(classifier_file):
classifier = io_utils.json_unpickle(classifier_file)
classes = io_utils.json_unpickle(encoder_file)
encoder = LabelEncoder()
encoder.classes_ = classes
return cls(meta, classifier, encoder)
else:
return cls(meta)
return BernoulliNB(**kwargs)
