forked from TeamHG-Memex/Formasaurus
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve interoperability with form2request and Scrapy Cloud support (#3)
- Loading branch information
Showing
22 changed files
with
675 additions
and
264 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,8 @@ | |
API Reference | ||
============= | ||
|
||
.. autofunction:: formasaurus.build_submission | ||
|
||
Classifiers | ||
----------- | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
__version__ = "0.9.0" | ||
|
||
from ._form2request import build_submission | ||
from .classifiers import FormFieldClassifier, classify, classify_proba, extract_forms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
"""High-level API for easier integration with form2request.""" | ||
|
||
from __future__ import annotations | ||
|
||
from lxml.html import FormElement, HtmlElement | ||
from parsel import Selector, SelectorList | ||
|
||
from .classifiers import extract_forms | ||
|
||
|
||
def build_submission( | ||
html: bytes | str | HtmlElement | Selector | SelectorList, | ||
form_type: str, | ||
fields: dict[str, str] = None, | ||
*, | ||
min_proba: float = 0.05, | ||
) -> tuple[FormElement, dict[str, str], HtmlElement | None]: | ||
"""Return the form, data, and submit button to submit an HTML form. | ||
*html* is the source HTML response, where the form to submit will be found. | ||
*form_type* is one of the :ref:`supported form types <form-types>`. The | ||
returned form is the one of the specified type with the highest probability | ||
and a minimum probability of *min_proba*. If there is no match, | ||
:exc:`ValueError` is raised. | ||
.. note:: A probability is always a :class:`float` in the [0, 1] range. | ||
*fields* is a dictionary of key-value pairs of data to submit with the | ||
form, where keys are :ref:`supported field types <field-types>` instead of | ||
actual form field names. | ||
The resulting tuple contains: | ||
#. The matching form. | ||
#. A dictionary of data to submit with the form. It is the content of | ||
*fields*, with keys replaced by their corresponding form field names. | ||
Missing fields are silently dropped. When multiple field names matching | ||
a given field type are found, the field name with the highest | ||
probability is used. | ||
#. The submit button of the form, or ``None`` if no submit button was | ||
found. If multiple submit buttons are found, the one with the highest | ||
probability is returned. | ||
You can use the :doc:`form2request library <form2request:index>` to turn | ||
the result into an HTTP request: | ||
>>> form, data, submit_button = build_submission(html, "search", {"search query": "foo"}) # doctest: +SKIP | ||
>>> request_data = form2request(form, data, click=submit_button) # doctest: +SKIP | ||
""" | ||
if isinstance(html, Selector): | ||
html = html.root | ||
elif isinstance(html, SelectorList): | ||
try: | ||
html = html[0].root | ||
except IndexError: | ||
raise ValueError("html is an empty SelectorList") | ||
forms = extract_forms(html, proba=True, threshold=min_proba) | ||
if not forms: | ||
raise ValueError("No form found") | ||
form, info = max(forms, key=lambda entry: entry[1]["form"].get(form_type, 0.0)) | ||
proba = info["form"].get(form_type, 0.0) | ||
if proba < min_proba: | ||
raise ValueError( | ||
f"Best matching form probability is below {min_proba:%}: {proba:%}" | ||
) | ||
|
||
data = {} | ||
fields = fields or {} | ||
for field_type, value in fields.items(): | ||
matching_fields = [ | ||
(field_name, proba) | ||
for field_name, field_data in info["fields"].items() | ||
for _field_type, proba in field_data.items() | ||
if _field_type == field_type | ||
] | ||
if not matching_fields: | ||
continue | ||
field_name, _ = max(matching_fields, key=lambda entry: entry[1]) | ||
data[field_name] = value | ||
|
||
submit_button = None | ||
matching_fields = [ | ||
(field_name, proba) | ||
for field_name, field_data in info["fields"].items() | ||
for field_type, proba in field_data.items() | ||
if field_type == "submit button" | ||
] | ||
if matching_fields: | ||
field_name, _ = max(matching_fields, key=lambda entry: entry[1]) | ||
submit_button = form.xpath(f".//*[@name='{field_name}']")[0] | ||
|
||
return form, data, submit_button |
Oops, something went wrong.