From 9f4e3e6e7fc001f0c71eaf387905e93c46ef050d Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Thu, 27 Jul 2023 20:56:39 +0400 Subject: [PATCH 1/6] Drop Python 3.7 support. --- .github/workflows/tests-ubuntu.yml | 2 +- .github/workflows/tests-windows.yml | 2 +- README.rst | 2 +- setup.py | 1 - tox.ini | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests-ubuntu.yml b/.github/workflows/tests-ubuntu.yml index 7d3363c8..13e4a475 100644 --- a/.github/workflows/tests-ubuntu.yml +++ b/.github/workflows/tests-ubuntu.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11'] env: - TOXENV: py include: diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml index 4c512eb8..9ee8b260 100644 --- a/.github/workflows/tests-windows.yml +++ b/.github/workflows/tests-windows.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11'] env: - TOXENV: py include: diff --git a/README.rst b/README.rst index baa45803..eaf4e28e 100644 --- a/README.rst +++ b/README.rst @@ -28,7 +28,7 @@ web-poet .. intro starts -``web-poet`` is a Python 3.7+ implementation of the `page object pattern`_ for +``web-poet`` is a Python 3.8+ implementation of the `page object pattern`_ for web scraping. It enables writing portable, reusable web parsing code. .. _page object pattern: https://martinfowler.com/bliki/PageObject.html diff --git a/setup.py b/setup.py index 83b1c117..326bc0ae 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,6 @@ "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", diff --git a/tox.ini b/tox.ini index eb3747fe..39d75b5c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,py311,mypy,docs,types +envlist = py38,py39,py310,py311,mypy,docs,types [pytest] asyncio_mode = strict From 13a201e5dc62fb439dbd53d7aed1749c00c1b34d Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Thu, 27 Jul 2023 21:01:10 +0400 Subject: [PATCH 2/6] Bump pre-commit tool versions. --- .flake8 | 42 ++++++++++++++++++++++++++------------- .pre-commit-config.yaml | 6 +++--- tests/test_fields.py | 1 - tests/test_page_inputs.py | 1 - tests/test_requests.py | 2 -- tests/test_rules.py | 1 - web_poet/rules.py | 1 - web_poet/utils.py | 1 - 8 files changed, 31 insertions(+), 24 deletions(-) diff --git a/.flake8 b/.flake8 index 382aae90..12c84152 100644 --- a/.flake8 +++ b/.flake8 @@ -16,22 +16,36 @@ ignore = C408, # To be addressed: - D100, # Missing docstring in public module - D101, # Missing docstring in public class - D103, # Missing docstring in public function - D104, # Missing docstring in public package - D105, # Missing docstring in magic method - D107, # Missing docstring in __init__ - D200, # One-line docstring should fit on one line with quotes - D202, # No blank lines allowed after function docstring - D205, # 1 blank line required between summary line and description - D209, # Multi-line docstring closing quotes should be on a separate line - D400, # First line should end with a period - D401, # First line should be in imperative mood - D402, # First line should not be the function's "signature" + # Missing docstring in public module + D100, + # Missing docstring in public class + D101, + # Missing docstring in public function + D103, + # Missing docstring in public package + D104, + # Missing docstring in magic method + D105, + # Missing docstring in __init__ + D107, + # One-line docstring should fit on one line with quotes + D200, + # No blank lines allowed after function docstring + D202, + # 1 blank line required between summary line and description + D205, + # Multi-line docstring closing quotes should be on a separate line + D209, + # First line should end with a period + D400, + # First line should be in imperative mood + D401, + # First line should not be the function's "signature" + D402, # see https://github.com/PyCQA/flake8-bugbear/issues/278 - B024 # abstract base class without abstract methods + # abstract base class without abstract methods + B024 per-file-ignores = # F401: Ignore "imported but unused" errors in __init__ files, as those diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d0ed9da..1093742d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,13 +4,13 @@ repos: language_version: python3 exclude: ^docs/tutorial-project/ repo: https://github.com/ambv/black - rev: 22.12.0 + rev: 23.7.0 - hooks: - id: isort language_version: python3 exclude: ^docs/tutorial-project/ repo: https://github.com/PyCQA/isort - rev: 5.11.5 + rev: 5.12.0 - hooks: - id: flake8 language_version: python3 @@ -22,4 +22,4 @@ repos: - flake8-docstrings - flake8-string-format repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 6.0.0 diff --git a/tests/test_fields.py b/tests/test_fields.py index e3dd3a95..1a6ccff6 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -456,7 +456,6 @@ def field_foo_cached(self): @pytest.mark.asyncio async def test_field_with_handle_urls() -> None: - page = ProductPage() assert page.name == "name" assert page.price == 12.99 diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index 0edbb7e7..318f8ec7 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -231,7 +231,6 @@ def test_http_response_headers_from_bytes_dict() -> None: def test_http_response_headers_from_bytes_dict_err() -> None: - with pytest.raises(ValueError): HttpResponseHeaders.from_bytes_dict({b"Content-Length": [316]}) diff --git a/tests/test_requests.py b/tests/test_requests.py index 596ac169..1f294e06 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -28,7 +28,6 @@ async def async_test(req): @pytest.mark.asyncio async def test_perform_request_from_httpclient(async_mock) -> None: - url = "http://example.com" client = HttpClient() @@ -203,7 +202,6 @@ async def err(): @pytest.mark.asyncio async def test_http_client_batch_execute_with_exception(client_that_errs) -> None: - requests = [ HttpRequest("url-1"), HttpRequest("url-get", method="GET"), diff --git a/tests/test_rules.py b/tests/test_rules.py index 13336ae1..f36f01a7 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -148,7 +148,6 @@ def test_apply_rule_converter_on_pattern() -> None: def test_apply_rule_kwargs_only() -> None: - params = { "use": POTopLevel1, "instead_of": POTopLevelOverriden2, diff --git a/web_poet/rules.py b/web_poet/rules.py index c36338bb..53f3efe0 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -252,7 +252,6 @@ def handle_urls( """ def wrapper(cls): - if overrides is not None: msg = ( "The 'overrides' parameter in @handle_urls is deprecated. " diff --git a/web_poet/utils.py b/web_poet/utils.py index fa8ffa77..4eccc997 100644 --- a/web_poet/utils.py +++ b/web_poet/utils.py @@ -70,7 +70,6 @@ class NewName(SomeClass): """ class DeprecatedClass(new_class.__class__): - deprecated_class = None warned_on_subclass = False From e3aee70913daa295142185bba165d5771481d7e9 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 28 Jul 2023 17:22:45 +0400 Subject: [PATCH 3/6] Update typing. --- tests/test_serialization.py | 4 ++-- web_poet/_typing.py | 9 +-------- web_poet/mixins.py | 10 ++++------ 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/tests/test_serialization.py b/tests/test_serialization.py index a77b0902..7fd7764a 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -72,7 +72,7 @@ class ResponseData(Injectable): @attrs.define class MyWebPage(WebPage): - url: ResponseUrl + url_: ResponseUrl params: PageParams data: ResponseData @@ -155,7 +155,7 @@ def _deserialize(t: Type[C], data: SerializedLeafData) -> C: def test_write_data(book_list_html_response, tmp_path) -> None: @attrs.define class MyWebPage(WebPage): - url: ResponseUrl + url_: ResponseUrl url = ResponseUrl("http://example.com") diff --git a/web_poet/_typing.py b/web_poet/_typing.py index 9f2578c3..d7a2c984 100644 --- a/web_poet/_typing.py +++ b/web_poet/_typing.py @@ -1,13 +1,6 @@ """Utilities for typing""" import typing -if hasattr(typing, "get_args"): - _get_args = typing.get_args -else: - - def _get_args(base): - return getattr(base, "__args__", ()) - def is_generic_alias(obj) -> bool: for attr_name in ["GenericAlias", "_GenericAlias"]: @@ -20,7 +13,7 @@ def is_generic_alias(obj) -> bool: def get_generic_parameter(cls): for base in getattr(cls, "__orig_bases__", []): if is_generic_alias(base): - args = _get_args(base) + args = typing.get_args(base) return args[0] diff --git a/web_poet/mixins.py b/web_poet/mixins.py index 7c07845d..c353bd20 100644 --- a/web_poet/mixins.py +++ b/web_poet/mixins.py @@ -8,6 +8,7 @@ from w3lib.html import get_base_url if TYPE_CHECKING: + from web_poet.page_inputs.http import HttpResponse # pragma: nocover from web_poet.page_inputs.url import RequestUrl, ResponseUrl # pragma: nocover @@ -78,10 +79,6 @@ def urljoin(self, url: Union[str, RequestUrl, ResponseUrl]) -> RequestUrl: return RequestUrl(urljoin(self._base_url, str(url))) -# TODO: when dropping Python 3.7 support, -# fix untyped ResponseShortcutsMixin.response using typing.Protocol - - class ResponseShortcutsMixin(SelectableMixin, UrlShortcutsMixin): """Common shortcut methods for working with HTML responses. This mixin could be used with Page Object base classes. @@ -89,15 +86,16 @@ class ResponseShortcutsMixin(SelectableMixin, UrlShortcutsMixin): It requires "response" attribute to be present. """ + response: HttpResponse _cached_base_url = None @property - def url(self): + def url(self) -> str: """Shortcut to HTML Response's URL, as a string.""" return str(self.response.url) @property - def html(self): + def html(self) -> str: """Shortcut to HTML Response's content.""" return self.response.text From c39a1a7f5dd8af573713ae11a82feff9fb48bb05 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 28 Jul 2023 17:56:19 +0400 Subject: [PATCH 4/6] Don't inherit from WebPage in serialization tests. --- tests/test_serialization.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 7fd7764a..74562557 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -8,6 +8,7 @@ HttpResponse, HttpResponseBody, Injectable, + ItemPage, PageParams, ResponseUrl, WebPage, @@ -24,7 +25,7 @@ ) -def _assert_webpages_equal(p1: WebPage, p2: WebPage) -> None: +def _assert_pages_equal(p1, p2) -> None: assert type(p1) == type(p2) assert type(p1.response) == type(p2.response) # noqa: E721 assert type(p1.response.body) == type(p2.response.body) # noqa: E721 @@ -71,8 +72,9 @@ class ResponseData(Injectable): response: HttpResponse @attrs.define - class MyWebPage(WebPage): - url_: ResponseUrl + class MyWebPage(ItemPage): + response: HttpResponse + url: ResponseUrl params: PageParams data: ResponseData @@ -104,7 +106,7 @@ class MyWebPage(WebPage): book_list_html_response, url, page_params, ResponseData(book_list_html_response) ) deserialized_po = deserialize(MyWebPage, serialized_deps) - _assert_webpages_equal(po, deserialized_po) + _assert_pages_equal(po, deserialized_po) assert deserialized_po.data is not None @@ -154,8 +156,9 @@ def _deserialize(t: Type[C], data: SerializedLeafData) -> C: def test_write_data(book_list_html_response, tmp_path) -> None: @attrs.define - class MyWebPage(WebPage): - url_: ResponseUrl + class MyWebPage(ItemPage): + response: HttpResponse + url: ResponseUrl url = ResponseUrl("http://example.com") @@ -178,7 +181,7 @@ class MyWebPage(WebPage): po = MyWebPage(book_list_html_response, url) deserialized_po = deserialize(MyWebPage, read_serialized_deps) assert type(deserialized_po) == MyWebPage - _assert_webpages_equal(po, deserialized_po) + _assert_pages_equal(po, deserialized_po) def test_extra_files(book_list_html_response, tmp_path) -> None: From 765fec07d86b95100f09003930c07e8cc1ed482d Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 28 Jul 2023 19:18:55 +0400 Subject: [PATCH 5/6] Use a protocol in ResponseShortcutsMixin. --- web_poet/mixins.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/web_poet/mixins.py b/web_poet/mixins.py index c353bd20..8b55b84b 100644 --- a/web_poet/mixins.py +++ b/web_poet/mixins.py @@ -1,7 +1,7 @@ from __future__ import annotations import abc -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Protocol, Union from urllib.parse import urljoin import parsel @@ -79,14 +79,17 @@ def urljoin(self, url: Union[str, RequestUrl, ResponseUrl]) -> RequestUrl: return RequestUrl(urljoin(self._base_url, str(url))) -class ResponseShortcutsMixin(SelectableMixin, UrlShortcutsMixin): +class ResponseProtocol(Protocol): + response: HttpResponse + + +class ResponseShortcutsMixin(ResponseProtocol, SelectableMixin, UrlShortcutsMixin): """Common shortcut methods for working with HTML responses. This mixin could be used with Page Object base classes. It requires "response" attribute to be present. """ - response: HttpResponse _cached_base_url = None @property From ffaa3036509651fc4a5801d72dcd744f04db4207 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 28 Jul 2023 20:04:05 +0400 Subject: [PATCH 6/6] Fix tests for the new zyte-common-items metadata handling. --- tests/test_testing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_testing.py b/tests/test_testing.py index c26abe1f..2935ce3d 100644 --- a/tests/test_testing.py +++ b/tests/test_testing.py @@ -220,13 +220,18 @@ class MetadataLocalTime(Metadata): dateDownloadedLocal: Optional[str] = None -def _get_product_item(date: datetime.datetime) -> Product: +@attrs.define(kw_only=True) +class ProductLocalTime(Product): + metadata: Optional[MetadataLocalTime] + + +def _get_product_item(date: datetime.datetime) -> ProductLocalTime: if date.tzinfo is None: # convert to the aware object so that date_local_str always includes the offset date = date.astimezone() date_str = date.astimezone(dateutil.tz.UTC).strftime("%Y-%M-%dT%H:%M:%SZ") date_local_str = date.strftime("%Y-%M-%dT%H:%M:%S%z") - return Product( + return ProductLocalTime( url="http://example.com", name="foo", metadata=MetadataLocalTime(