Improve interoperability with form2request and Scrapy Cloud support (#3)

scrapinghub · Nov 6, 2024 · 978877e · 978877e
1 parent 2e282ad
commit 978877e
Show file tree

Hide file tree

Showing 22 changed files with 675 additions and 264 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -10,9 +10,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-        - python-version: "3.8"
+        - python-version: "3.9"
           toxenv: min
-        - python-version: "3.8"
         - python-version: "3.9"
         - python-version: "3.10"
         - python-version: "3.11"

diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,4 @@
 *.pyc
-*.joblib
 *egg-info
 .tox
 build
@@ -9,7 +8,6 @@ dist
 htmlcov
 .coverage.*
 .coverage
-.joblib
 .cache
 .pytest_cache
 docs/_build
diff --git a/docs/api.rst b/docs/api.rst
@@ -3,6 +3,8 @@
 API Reference
 =============
 
+.. autofunction:: formasaurus.build_submission
+
 Classifiers
 -----------
 

diff --git a/docs/conf.py b/docs/conf.py
@@ -14,53 +14,12 @@
 
 import os
 import sys
-from unittest.mock import MagicMock
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 sys.path.insert(0, os.path.abspath(".."))
 
-
-class Mock(MagicMock):
-    @classmethod
-    def __getattr__(cls, name):
-        if name == "_mock_methods":
-            raise AttributeError()
-        return Mock()
-
-
-MOCK_MODULES = [
-    "sklearn",
-    "sklearn.metrics",
-    "sklearn.externals",
-    "sklearn.feature_extraction",
-    "sklearn.feature_extraction.text",
-    "sklearn.pipeline",
-    "sklearn.linear_model",
-    "sklearn.svm",
-    "sklearn.model_selection",
-    "sklearn.grid_search",
-    "sklearn.cross_validation",
-    "tqdm",
-    "tabulate",
-    "numpy",
-    "scipy",
-    "scipy.stats",
-    "pycrfsuite",
-    "sklearn_crfsuite",
-    "sklearn_crfsuite.metrics",
-    "sklearn_crfsuite.utils",
-    "lxml",
-    "lxml.html",
-    "lxml.html.clean",
-    "ipywidgets",
-    "IPython",
-    "IPython.display",
-]
-
-sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
-
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
@@ -71,6 +30,7 @@ def __getattr__(cls, name):
 # ones.
 extensions = [
     "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
     "sphinx.ext.viewcode",
     "alabaster",
 ]
@@ -350,3 +310,12 @@ def __getattr__(cls, name):
 
 # If true, do not generate a @detailmenu in the "Top" node's menu.
 # texinfo_no_detailmenu = False
+
+
+# -- Intersphinx ----------------------------------------------------------
+
+# By default Sphinx does not resolve ``:doc:`` references through intersphinx
+# (``std:doc`` is disabled). Emptying the list enables cross-project targets
+# such as ``form2request:index`` and ``requests:index``.
+intersphinx_disabled_reftypes = []
+# Object inventories of the external projects that the documentation links to.
+intersphinx_mapping = {
+    "form2request": ("https://form2request.readthedocs.io/en/latest/", None),
+    "requests": ("https://requests.readthedocs.io/en/latest/", None),
+}
diff --git a/docs/contributing.rst b/docs/contributing.rst
@@ -25,6 +25,16 @@ examples. Use "Add New Pages" and "Annotate" IPython notebooks for that.
 
 If you want to improve Formasaurus ML models check :ref:`how-it-works` section.
 
+Generating the built-in model
+-----------------------------
+
+Every time we improve the training data, we should re-train the built-in model:
+
+.. code-block:: sh
+
+    pip install .
+    python utils/build.py
+
 Authors
 -------
 

diff --git a/docs/install.rst b/docs/install.rst
@@ -1,11 +1,11 @@
 Install
 =======
 
-Formasaurus requires Python 3.8+.
+Formasaurus requires Python 3.9+.
 
 To install it, run::
 
-    pip install formasaurus[with_deps]
+    pip install formasaurus
 
 After installation it is convenient to execute the ``formasaurus init``
 command. It ensures all necessary initialization is done. Without it,

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -6,41 +6,46 @@ Basic Usage
 
 Grab some HTML:
 
-    >>> import requests
-    >>> html = requests.get('https://www.github.com/').text
-
-Then use :func:`formasaurus.extract_forms <formasaurus.classifiers.extract_forms>`
-to detect form and field types:
-
-    >>> import formasaurus
-    >>> formasaurus.extract_forms(html)
-    [(<Element form at 0x1150ba0e8>,
-      {'fields': {'q': 'search query'}, 'form': 'search'}),
-     (<Element form at 0x1150ba138>,
-      {'fields': {'user[email]': 'email',
-        'user[login]': 'username',
-        'user[password]': 'password'},
-       'form': 'registration'})]
+>>> import requests
+>>> html = requests.get('https://www.github.com/').text
+
+To build and send an HTML form submission request, use
+:func:`~formasaurus.build_submission`, the :doc:`form2request library
+<form2request:index>`, and an HTTP client like the :doc:`requests library
+<requests:index>`:
+
+>>> import requests
+>>> from form2request import form2request
+>>> from formasaurus import build_submission
+>>> form, data, submit_button = build_submission(html, "search", {"search query": "foo"})
+>>> request_data = form2request(form, data, click=submit_button)
+>>> request = request_data.to_requests()
+>>> requests.Session().send(request)
+<Response [200]>
 
 .. note::
 
     To detect form and field types Formasaurus needs to train prediction
     models on user machine. This is done automatically on first call;
     models are saved to a file and then reused.
 
-:func:`formasaurus.extract_forms <formasaurus.classifiers.extract_forms>`
-returns a list of (form, info) tuples, one tuple for each ``<form>``
-element on a page. ``form`` is a lxml Element for a form,
-``info`` dict contains form and field types.
-
-Only fields which are
+To get data about all detected forms and field types, use
+:func:`~formasaurus.classifiers.extract_forms`:
 
-1. visible to user;
-2. have non-empty ``name`` attribute
+>>> import formasaurus
+>>> formasaurus.extract_forms(html)
+[(<Element form at 0x1150ba0e8>,
+  {'fields': {'q': 'search query'}, 'form': 'search'}),
+  (<Element form at 0x1150ba138>,
+  {'fields': {'user[email]': 'email',
+    'user[login]': 'username',
+    'user[password]': 'password'},
+    'form': 'registration'})]
 
-are returned - other fields usually should be either submitted as-is
-(hidden fields) or not sent to the server at all (fields without
-``name`` attribute).
+Formasaurus only considers fields that are user-visible and have a non-empty
+``name`` attribute. Usually, other fields should be either submitted as is
+(hidden fields) or not sent to the server at all (fields without a ``name``
+attribute).
 
 There are edge cases like fields filled with JS or fields which are made
 invisible using CSS, but all bets are off if page uses JS heavily and all
@@ -109,6 +114,7 @@ In this example the data is loaded from an URL; of course, data may be
 loaded from a local file or from an in-memory object, or you may already
 have the tree loaded (e.g. with Scrapy).
 
+.. _form-types:
 
 Form Types
 ----------
@@ -137,6 +143,8 @@ predictions for the whole dataset.
 
 See also: https://en.wikipedia.org/wiki/Precision_and_recall
 
+.. _field-types:
+
 Field Types
 -----------
 

diff --git a/formasaurus/__init__.py b/formasaurus/__init__.py
@@ -1,3 +1,4 @@
 __version__ = "0.9.0"
 
+from ._form2request import build_submission
 from .classifiers import FormFieldClassifier, classify, classify_proba, extract_forms
diff --git a/formasaurus/_form2request.py b/formasaurus/_form2request.py
@@ -0,0 +1,95 @@
+"""High-level API for easier integration with form2request."""
+
+from __future__ import annotations
+
+from lxml.html import FormElement, HtmlElement
+from parsel import Selector, SelectorList
+
+from .classifiers import extract_forms
+
+
+def _best_field_name(
+    field_probas: dict[str, dict[str, float]], field_type: str
+) -> str | None:
+    """Return the name of the field most likely to be of *field_type*.
+
+    *field_probas* maps field names to ``{field type: probability}``
+    dictionaries. ``None`` is returned if no field has a probability for
+    *field_type*. On ties, the first field in iteration order wins.
+    """
+    candidates = {
+        name: probas[field_type]
+        for name, probas in field_probas.items()
+        if field_type in probas
+    }
+    if not candidates:
+        return None
+    return max(candidates, key=candidates.get)
+
+
+def build_submission(
+    html: bytes | str | HtmlElement | Selector | SelectorList,
+    form_type: str,
+    fields: dict[str, str] | None = None,
+    *,
+    min_proba: float = 0.05,
+) -> tuple[FormElement, dict[str, str], HtmlElement | None]:
+    """Return the form, data, and submit button to submit an HTML form.
+
+    *html* is the source HTML response, where the form to submit will be found.
+    If it is a :class:`~parsel.Selector`, its root node is used. If it is a
+    :class:`~parsel.SelectorList`, the root node of its first selector is used.
+
+    *form_type* is one of the :ref:`supported form types <form-types>`. The
+    returned form is the one of the specified type with the highest probability
+    and a minimum probability of *min_proba*. If there is no match,
+    :exc:`ValueError` is raised.
+
+    .. note:: A probability is always a :class:`float` in the [0, 1] range.
+
+    *fields* is a dictionary of key-value pairs of data to submit with the
+    form, where keys are :ref:`supported field types <field-types>` instead of
+    actual form field names. ``None`` (the default) is treated as an empty
+    dictionary.
+
+    The resulting tuple contains:
+
+    #.  The matching form.
+
+    #.  A dictionary of data to submit with the form. It is the content of
+        *fields*, with keys replaced by their corresponding form field names.
+        Missing fields are silently dropped. When multiple field names matching
+        a given field type are found, the field name with the highest
+        probability is used.
+
+    #.  The submit button of the form, or ``None`` if no submit button was
+        found. If multiple submit buttons are found, the one with the highest
+        probability is returned.
+
+    :exc:`ValueError` is raised if *html* is an empty
+    :class:`~parsel.SelectorList`, if no form is found, or if the best matching
+    form has a probability below *min_proba*.
+
+    You can use the :doc:`form2request library <form2request:index>` to turn
+    the result into an HTTP request:
+
+    >>> form, data, submit_button = build_submission(html, "search", {"search query": "foo"})  # doctest: +SKIP
+    >>> request_data = form2request(form, data, click=submit_button)  # doctest: +SKIP
+    """
+    if isinstance(html, Selector):
+        html = html.root
+    elif isinstance(html, SelectorList):
+        try:
+            html = html[0].root
+        except IndexError:
+            # The IndexError is an implementation detail; do not chain it.
+            raise ValueError("html is an empty SelectorList") from None
+    forms = extract_forms(html, proba=True, threshold=min_proba)
+    if not forms:
+        raise ValueError("No form found")
+    form, info = max(forms, key=lambda entry: entry[1]["form"].get(form_type, 0.0))
+    proba = info["form"].get(form_type, 0.0)
+    if proba < min_proba:
+        raise ValueError(
+            f"Best matching form probability is below {min_proba:%}: {proba:%}"
+        )
+
+    # Translate field types into the actual names of the form fields.
+    data = {}
+    for field_type, value in (fields or {}).items():
+        field_name = _best_field_name(info["fields"], field_type)
+        if field_name is not None:
+            data[field_name] = value
+
+    submit_button = None
+    button_name = _best_field_name(info["fields"], "submit button")
+    if button_name is not None:
+        # Pass the name as an XPath variable instead of interpolating it into
+        # the expression, so that names containing quotes cannot break it.
+        matches = form.xpath(".//*[@name=$name]", name=button_name)
+        if matches:
+            submit_button = matches[0]
+
+    return form, data, submit_button