Skip to content

Commit

Permalink
Merge pull request #132 from scrapinghub/rerun
Browse files Browse the repository at this point in the history
python -m web_poet.testing rerun
  • Loading branch information
kmike authored Feb 20, 2023
2 parents 9e649c4 + c2b1806 commit a8a6982
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 3 deletions.
54 changes: 53 additions & 1 deletion docs/page-objects/testing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ that Page Object fully qualified class name. Each fixture is a directory inside
it, that contains data for Page Object inputs and output::

fixtures
└── my_project.po.MyItemPage
└── my_project.pages.MyItemPage
├── test-1
│ ├── inputs
│ ├── HttpClient.exists
Expand Down Expand Up @@ -102,6 +102,8 @@ It's available starting with scrapy-poet 0.8.0.

.. _scrapy-poet: https://github.com/scrapinghub/scrapy-poet

.. _web-poet-testing-pytest:

Running tests
=============

Expand Down Expand Up @@ -132,6 +134,56 @@ In this case there is going to be a single test per fixture: if the result
is not fully correct, the test fails. So, following the previous example,
it'd be 2 tests instead of 14.

Test-Driven Development
=======================

You can follow the TDD (Test-Driven Development) approach to develop your
page objects. To do so:

1. Generate a fixture (see :ref:`web-poet-testing-scrapy-poet`).
2. Populate ``output.json`` with the correct expected output.
3. Run the tests (see :ref:`web-poet-testing-pytest`) and update the code
until all tests pass. It's convenient to use web-poet :ref:`fields`,
and implement extraction field-by-field, because you'll be getting
an additional test passing after each field is implemented.

This approach allows a fast feedback loop: there is no need to download the page
multiple times, and you have a clear progress indication for your work
(number of failing tests remaining). Also, in the end you get
a regression test, which can be helpful later.

Sometimes it may be awkward to set the correct value in JSON before starting
the development, especially if a value is large or has a complex structure.
For example, this could be the case for an e-commerce product description field,
which can be hard to copy-paste from the website, and which may have various
whitespace normalization rules which you need to apply.

In this case, it may be more convenient to implement the extraction first,
and only then populate the ``output.json`` file with the correct value.

You can use the ``python -m web_poet.testing rerun <fixture_path>`` command
in this case, to re-run the page object using the inputs saved in a fixture.
This command prints the output of the page object as JSON; you can then copy-paste
relevant parts to the ``output.json`` file. It's also possible to make
the command print only some of the fields. For example, you might run the
following command after implementing extraction for "description" and
"descriptionHtml" fields in ``my_project.pages.MyItemPage``::

python -m web_poet.testing rerun \
fixtures/my_project.pages.MyItemPage/test-1 \
--fields description,descriptionHtml

It may output something like this::

{
"description": "..description of the product..",
"descriptionHtml": "<p>...</p>"
}

If these values look good, you can update
``fixtures/my_project.pages.MyItemPage/test-1/output.json`` file
with these values.

.. _web-poet-testing-frozen_time:

Handling time fields
Expand Down
52 changes: 51 additions & 1 deletion tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from web_poet.exceptions import HttpResponseError
from web_poet.page_inputs.client import _SavedResponseData
from web_poet.testing import Fixture
from web_poet.testing.__main__ import main as cli_main
from web_poet.testing.fixture import INPUT_DIR_NAME, META_FILE_NAME, OUTPUT_FILE_NAME
from web_poet.utils import get_fq_class_name

Expand Down Expand Up @@ -66,7 +67,7 @@ async def to_item(self) -> dict: # noqa: D102

def _save_fixture(pytester, page_cls, page_inputs, expected):
    """Save a fixture for ``page_cls`` inside the pytester directory and return it.

    The fixture is written below
    ``<pytester.path>/fixtures/<fully qualified class name of page_cls>``,
    with ``page_inputs`` as its inputs and ``expected`` as its saved item.
    The value returned by ``Fixture.save`` is handed back so that callers
    can locate the fixture on disk (e.g. through ``fixture.path``).
    """
    base_dir = pytester.path / "fixtures" / get_fq_class_name(page_cls)
    # Save exactly once and return the result; a second bare
    # ``Fixture.save(...)`` call whose result is discarded is not needed.
    return Fixture.save(base_dir, inputs=page_inputs, item=expected)


def test_pytest_plugin_pass(pytester, book_list_html_response) -> None:
Expand Down Expand Up @@ -368,3 +369,52 @@ def test_httpclient_exception(pytester, book_list_html_response) -> None:
)
result = pytester.runpytest()
result.assert_outcomes(passed=4)


class MyItemPage3(WebPage):
    """Page object with a fixed three-field output, used by the CLI rerun tests."""

    async def to_item(self) -> dict:
        """Return the same hard-coded item on every call."""
        item = dict(foo="bar", egg="spam", hello="world")
        return item


def test_cli_rerun(pytester, book_list_html_response, capsys) -> None:
    """``rerun`` prints what the page object returns, not the saved output."""
    # The saved item deliberately differs from MyItemPage3's output ("bar2").
    saved_item = {"foo": "bar2", "egg": "spam", "hello": "world"}
    saved = _save_fixture(
        pytester,
        page_cls=MyItemPage3,
        page_inputs=[book_list_html_response],
        expected=saved_item,
    )

    cli_main(["rerun", str(saved.path)])

    stdout, stderr = capsys.readouterr()
    assert not stderr
    assert json.loads(stdout) == {"foo": "bar", "egg": "spam", "hello": "world"}


def test_cli_rerun_fields(pytester, book_list_html_response, capsys) -> None:
    """``--fields`` restricts the printed JSON to the requested fields."""
    saved_item = {"foo": "bar2", "egg": "spam", "hello": "world"}
    saved = _save_fixture(
        pytester,
        page_cls=MyItemPage3,
        page_inputs=[book_list_html_response],
        expected=saved_item,
    )

    cli_main(["rerun", str(saved.path), "--fields=foo,egg"])

    stdout, stderr = capsys.readouterr()
    assert not stderr
    assert json.loads(stdout) == {"foo": "bar", "egg": "spam"}


def test_cli_rerun_fields_unknown_names(
    pytester, book_list_html_response, capsys
) -> None:
    """Unknown ``--fields`` names are reported on stderr; known ones still print."""
    saved_item = {"foo": "bar2", "egg": "spam", "hello": "world"}
    saved = _save_fixture(
        pytester,
        page_cls=MyItemPage3,
        page_inputs=[book_list_html_response],
        expected=saved_item,
    )

    cli_main(["rerun", str(saved.path), "--fields=foo,egg2"])

    stdout, stderr = capsys.readouterr()
    expected_warning = (
        "Unknown field names: ['egg2']. Allowed names are: ['egg', 'foo', 'hello']"
    )
    assert expected_warning in stderr
    assert json.loads(stdout) == {"foo": "bar"}
51 changes: 51 additions & 0 deletions web_poet/testing/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import argparse
import sys
from pathlib import Path

from web_poet.testing import Fixture


def rerun(args):
    """Handle the ``rerun`` sub-command: print a fixture's fresh output as JSON.

    ``args`` is the parsed command-line namespace. Two attributes are read:

    * ``args.fixture_path`` -- path of the fixture directory;
    * ``args.fields`` -- optional comma-separated field names. When given,
      only these fields are printed.

    The item is obtained with ``fixture.get_output()`` and written to stdout
    via ``fixture.item_to_json()``. Requested names that are not keys of the
    item are reported on stderr and skipped; the remaining fields are still
    printed.
    """
    fixture = Fixture(Path(args.fixture_path))
    item = fixture.get_output()

    # Tolerate ``--fields "foo, bar,"``: surrounding whitespace and empty
    # entries are not part of a field name, so they must neither trigger
    # the "unknown field" warning nor cause a valid field to be dropped.
    requested = [name.strip() for name in (args.fields or "").split(",")]
    requested = [name for name in requested if name]

    if requested:
        unknown_fields = sorted(set(requested) - item.keys())
        if unknown_fields:
            print(
                f"Unknown field names: {unknown_fields}. "
                f"Allowed names are: {sorted(item.keys())}\n",
                file=sys.stderr,
            )
        # Preserve the order in which the fields were requested.
        item = {field: item[field] for field in requested if field in item}
    print(fixture.item_to_json(item))


def main(argv=None):
    """Entry point of the ``python -m web_poet.testing`` command line.

    ``argv`` is the list of command-line arguments to parse; ``None`` makes
    argparse fall back to ``sys.argv``. The selected sub-command handler is
    called with the parsed namespace; when no sub-command is given, the help
    text is printed instead.
    """
    parser = argparse.ArgumentParser(
        prog="python -m web_poet.testing",
        description="web-poet testing utilities",
    )
    commands = parser.add_subparsers()

    rerun_description = (
        "Run the page object used in a fixture, print its output "
        "as JSON. This is most useful when the page object is changed, "
        "and you want to update the test case."
    )
    rerun_parser = commands.add_parser("rerun", description=rerun_description)
    rerun_parser.add_argument("fixture_path", type=str, help="Path to a fixture")
    rerun_parser.add_argument(
        "--fields", "-f", type=str, help="Field names, comma-separated"
    )
    # Each sub-command stores its handler in ``func``.
    rerun_parser.set_defaults(func=rerun)

    parsed = parser.parse_args(argv)
    handler = getattr(parsed, "func", None)
    if handler is None:
        # No sub-command was selected.
        parser.print_help()
        return
    handler(parsed)


if __name__ == "__main__":
    main()
11 changes: 10 additions & 1 deletion web_poet/testing/fixture.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ def get_output(self) -> dict:
self._output_error = e
raise

@classmethod
def item_to_json(cls, item: Any) -> str:
    """Serialize an item to a JSON string.

    The item is converted to a dict with ``ItemAdapter(item).asdict()`` and
    dumped with a 4-space indent; non-ASCII characters are written as-is
    instead of being escaped.
    """
    as_dict = ItemAdapter(item).asdict()
    return json.dumps(as_dict, indent=4, ensure_ascii=False)

@memoizemethod_noargs
def get_expected_output(self) -> dict:
"""Return the saved output."""
Expand Down Expand Up @@ -222,12 +227,16 @@ def save(
fixture_dir = Path(base_directory, fixture_name)
fixture = cls(fixture_dir)
fixture.input_path.mkdir(parents=True)

serialized_inputs = serialize(inputs)
storage = SerializedDataFileStorage(fixture.input_path)
storage.write(serialized_inputs)

with fixture.output_path.open("w") as f:
json.dump(ItemAdapter(item).asdict(), f, ensure_ascii=False, indent=4)
f.write(cls.item_to_json(item))

if meta:
with fixture.meta_path.open("w") as f:
json.dump(meta, f, ensure_ascii=False, indent=4)

return fixture

0 comments on commit a8a6982

Please sign in to comment.