Skip to content

Commit

Permalink
Improve interoperability with form2request and Scrapy Cloud support (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Nov 6, 2024
1 parent 2e282ad commit 978877e
Show file tree
Hide file tree
Showing 22 changed files with 675 additions and 264 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ jobs:
fail-fast: false
matrix:
include:
- python-version: "3.8"
- python-version: "3.9"
toxenv: min
- python-version: "3.8"
- python-version: "3.9"
- python-version: "3.10"
- python-version: "3.11"
Expand Down
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
*.pyc
*.joblib
*egg-info
.tox
build
Expand All @@ -9,7 +8,6 @@ dist
htmlcov
.coverage.*
.coverage
.joblib
.cache
.pytest_cache
docs/_build
2 changes: 2 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
API Reference
=============

.. autofunction:: formasaurus.build_submission

Classifiers
-----------

Expand Down
51 changes: 10 additions & 41 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,53 +14,12 @@

import os
import sys
from unittest.mock import MagicMock

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath(".."))


class Mock(MagicMock):
    """Placeholder object installed in ``sys.modules`` for mocked modules.

    Any attribute lookup that reaches ``__getattr__`` produces a fresh
    ``Mock``; the only exception is ``_mock_methods``, for which
    ``AttributeError`` is raised.
    """

    @classmethod
    def __getattr__(cls, name):
        # NOTE(review): ``_mock_methods`` is presumably probed by
        # ``unittest.mock`` internals and has to look missing - confirm.
        if name != "_mock_methods":
            return Mock()
        raise AttributeError()


# Names of the modules that get replaced by ``Mock`` instances below.
# NOTE(review): presumably these are the heavy third-party dependencies that
# autodoc would otherwise need installed to import the package - confirm.
MOCK_MODULES = [
    "sklearn",
    "sklearn.metrics",
    "sklearn.externals",
    "sklearn.feature_extraction",
    "sklearn.feature_extraction.text",
    "sklearn.pipeline",
    "sklearn.linear_model",
    "sklearn.svm",
    "sklearn.model_selection",
    "sklearn.grid_search",
    "sklearn.cross_validation",
    "tqdm",
    "tabulate",
    "numpy",
    "scipy",
    "scipy.stats",
    "pycrfsuite",
    "sklearn_crfsuite",
    "sklearn_crfsuite.metrics",
    "sklearn_crfsuite.utils",
    "lxml",
    "lxml.html",
    "lxml.html.clean",
    "ipywidgets",
    "IPython",
    "IPython.display",
]

# Register one separate Mock per listed name, so that importing any of them
# returns the mock instead of loading the real module.
sys.modules.update({mod_name: Mock() for mod_name in MOCK_MODULES})

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
Expand All @@ -71,6 +30,7 @@ def __getattr__(cls, name):
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
"alabaster",
]
Expand Down Expand Up @@ -350,3 +310,12 @@ def __getattr__(cls, name):

# If true, do not generate a @detailmenu in the "Top" node's menu.
# texinfo_no_detailmenu = False


# -- Intersphinx ----------------------------------------------------------

# An empty list means no reference type is excluded from intersphinx
# resolution.
# NOTE(review): presumably set so that ``:doc:`` references into the projects
# mapped below resolve (Sphinx excludes ``std:doc`` by default) - confirm.
intersphinx_disabled_reftypes = []
# Each entry maps an inventory name to a (base URL, inventory location)
# tuple; the inventory location is left as ``None`` for both projects.
intersphinx_mapping = {
"form2request": ("https://form2request.readthedocs.io/en/latest/", None),
"requests": ("https://requests.readthedocs.io/en/latest/", None),
}
10 changes: 10 additions & 0 deletions docs/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ examples. Use "Add New Pages" and "Annotate" IPython notebooks for that.

If you want to improve Formasaurus ML models check :ref:`how-it-works` section.

Generating the built-in model
-----------------------------

Every time we improve the training data, we should re-train the built-in model:

.. code-block:: sh

    pip install .
    python utils/build.py

Authors
-------

Expand Down
4 changes: 2 additions & 2 deletions docs/install.rst
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
Install
=======

Formasaurus requires Python 3.8+.
Formasaurus requires Python 3.9+.

To install it, run::

pip install formasaurus[with_deps]
pip install formasaurus

After installation it is convenient to execute the ``formasaurus init``
command. It ensures all necessary initialization is done. Without it,
Expand Down
60 changes: 34 additions & 26 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,46 @@ Basic Usage

Grab some HTML:

>>> import requests
>>> html = requests.get('https://www.github.com/').text

Then use :func:`formasaurus.extract_forms <formasaurus.classifiers.extract_forms>`
to detect form and field types:

>>> import formasaurus
>>> formasaurus.extract_forms(html)
[(<Element form at 0x1150ba0e8>,
{'fields': {'q': 'search query'}, 'form': 'search'}),
(<Element form at 0x1150ba138>,
{'fields': {'user[email]': 'email',
'user[login]': 'username',
'user[password]': 'password'},
'form': 'registration'})]
>>> import requests
>>> html = requests.get('https://www.github.com/').text

To build and send an HTML form submission request, use
:func:`~formasaurus.build_submission`, the :doc:`form2request library
<form2request:index>`, and an HTTP client like the :doc:`requests library
<requests:index>`:

>>> import requests
>>> from form2request import form2request
>>> from formasaurus import build_submission
>>> form, data, submit_button = build_submission(html, "search", {"search query": "foo"})
>>> request_data = form2request(form, data, click=submit_button)
>>> request = request_data.to_requests()
>>> requests.Session().send(request)
<Response [200]>

.. note::

To detect form and field types, Formasaurus needs to train prediction
models on the user's machine. This is done automatically on the first call;
the models are saved to a file and then reused.

:func:`formasaurus.extract_forms <formasaurus.classifiers.extract_forms>`
returns a list of (form, info) tuples, one tuple for each ``<form>``
element on a page. ``form`` is a lxml Element for a form,
``info`` dict contains form and field types.

Only fields which are
To get data about all detected forms and field types, use
:func:`~formasaurus.classifiers.extract_forms`:

1. visible to user;
2. have non-empty ``name`` attribute
>>> import formasaurus
>>> formasaurus.extract_forms(html)
[(<Element form at 0x1150ba0e8>,
{'fields': {'q': 'search query'}, 'form': 'search'}),
(<Element form at 0x1150ba138>,
{'fields': {'user[email]': 'email',
'user[login]': 'username',
'user[password]': 'password'},
'form': 'registration'})]

are returned - other fields usually should be either submitted as-is
(hidden fields) or not sent to the server at all (fields without
``name`` attribute).
Formasaurus only considers fields that are user-visible and have a non-empty
``name`` attribute. Usually, other fields should be either submitted as is
(hidden fields) or not sent to the server at all (fields without a ``name``
attribute).

There are edge cases like fields filled with JS or fields which are made
invisible using CSS, but all bets are off if page uses JS heavily and all
Expand Down Expand Up @@ -109,6 +114,7 @@ In this example the data is loaded from an URL; of course, data may be
loaded from a local file or from an in-memory object, or you may already
have the tree loaded (e.g. with Scrapy).

.. _form-types:

Form Types
----------
Expand Down Expand Up @@ -137,6 +143,8 @@ predictions for the whole dataset.

See also: https://en.wikipedia.org/wiki/Precision_and_recall

.. _field-types:

Field Types
-----------

Expand Down
1 change: 1 addition & 0 deletions formasaurus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
__version__ = "0.9.0"

from ._form2request import build_submission
from .classifiers import FormFieldClassifier, classify, classify_proba, extract_forms
95 changes: 95 additions & 0 deletions formasaurus/_form2request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""High-level API for easier integration with form2request."""

from __future__ import annotations

from lxml.html import FormElement, HtmlElement
from parsel import Selector, SelectorList

from .classifiers import extract_forms


def build_submission(
    html: bytes | str | HtmlElement | Selector | SelectorList,
    form_type: str,
    fields: dict[str, str] | None = None,
    *,
    min_proba: float = 0.05,
) -> tuple[FormElement, dict[str, str], HtmlElement | None]:
    """Return the form, data, and submit button to submit an HTML form.

    *html* is the source HTML response, where the form to submit will be found.
    If it is a ``Selector``, its root is used; if it is a ``SelectorList``,
    the root of its first selector is used.

    *form_type* is one of the :ref:`supported form types <form-types>`. The
    returned form is the one of the specified type with the highest probability
    and a minimum probability of *min_proba*. If there is no match,
    :exc:`ValueError` is raised.

    .. note:: A probability is always a :class:`float` in the [0, 1] range.

    *fields* is a dictionary of key-value pairs of data to submit with the
    form, where keys are :ref:`supported field types <field-types>` instead of
    actual form field names. ``None`` (the default) is treated as an empty
    dictionary.

    The resulting tuple contains:

    #.  The matching form.

    #.  A dictionary of data to submit with the form. It is the content of
        *fields*, with keys replaced by their corresponding form field names.
        Missing fields are silently dropped. When multiple field names matching
        a given field type are found, the field name with the highest
        probability is used.

    #.  The submit button of the form, or ``None`` if no submit button was
        found. If multiple submit buttons are found, the one with the highest
        probability is returned.

    :exc:`ValueError` is raised if *html* is an empty ``SelectorList``, if no
    form is found in *html*, or if the best matching form has a probability
    for *form_type* below *min_proba*.

    You can use the :doc:`form2request library <form2request:index>` to turn
    the result into an HTTP request:

    >>> form, data, submit_button = build_submission(html, "search", {"search query": "foo"})  # doctest: +SKIP
    >>> request_data = form2request(form, data, click=submit_button)  # doctest: +SKIP
    """
    if isinstance(html, Selector):
        html = html.root
    elif isinstance(html, SelectorList):
        try:
            html = html[0].root
        except IndexError:
            # The IndexError carries no useful information for the caller, so
            # it is not chained to the ValueError.
            raise ValueError("html is an empty SelectorList") from None

    forms = extract_forms(html, proba=True, threshold=min_proba)
    if not forms:
        raise ValueError("No form found")
    form, info = max(forms, key=lambda entry: entry[1]["form"].get(form_type, 0.0))
    proba = info["form"].get(form_type, 0.0)
    if proba < min_proba:
        raise ValueError(
            f"Best matching form probability is below {min_proba:%}: {proba:%}"
        )

    field_probas = info["fields"]

    data = {}
    for field_type, value in (fields or {}).items():
        field_name = _best_field_name(field_probas, field_type)
        if field_name is not None:
            data[field_name] = value

    submit_button = None
    button_name = _best_field_name(field_probas, "submit button")
    if button_name is not None:
        # Pass the name as an XPath variable instead of interpolating it into
        # the expression, so that names containing quotes cannot break (or
        # alter) the query.
        matches = form.xpath(".//*[@name=$name]", name=button_name)
        if matches:
            submit_button = matches[0]

    return form, data, submit_button


def _best_field_name(
    field_probas: dict[str, dict[str, float]], field_type: str
) -> str | None:
    """Return the name of the field most likely to be of type *field_type*.

    *field_probas* maps field names to dictionaries that map field types to
    probabilities. ``None`` is returned if no field lists *field_type*. On a
    tie, the first field in iteration order wins.
    """
    candidates = [
        (field_name, probas[field_type])
        for field_name, probas in field_probas.items()
        if field_type in probas
    ]
    if not candidates:
        return None
    return max(candidates, key=lambda entry: entry[1])[0]
Loading

0 comments on commit 978877e

Please sign in to comment.