
Commit

Merge branch 'main' of ssh://github.com/zytedata/zyte-spider-templates into fix-dupe-requests
BurnzZ committed Feb 2, 2024
2 parents 63f408f + 14a6ffa commit e066993
Showing 7 changed files with 165 additions and 103 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
@@ -17,6 +17,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - python-version: '3.8'
+            toxenv: min
           - python-version: '3.8'
           - python-version: '3.9'
           - python-version: '3.10'
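
The added "toxenv: min" matrix entry pairs with the new [testenv:min] environment defined in the tox.ini change below, exercising the lowest supported dependency versions on Python 3.8.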
4 changes: 2 additions & 2 deletions setup.py
@@ -14,10 +14,10 @@
     install_requires=[
         "pydantic>=2",
         "scrapy>=2.11.0",
-        # "scrapy-poet>=0.16.0",
+        # "scrapy-poet>=0.20.1",
         "scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet",
         "scrapy-spider-metadata>=0.1.2",
-        # "scrapy-zyte-api[provider]>=0.12.2",
+        # "scrapy-zyte-api[provider]>=0.15.0",
         "scrapy-zyte-api @ git+https://[email protected]/scrapy-plugins/scrapy-zyte-api@http-or-browser-response#egg=scrapy-zyte-api",
         "zyte-common-items>=0.13.0",
     ],
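
The commented requirement lines appear to track the minimum released versions (scrapy-poet 0.20.1, scrapy-zyte-api 0.15.0) that are expected to replace the temporary Git-branch references below them once those releases are out.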
7 changes: 5 additions & 2 deletions tests/test_ecommerce.py
@@ -447,8 +447,11 @@ def test_metadata():
                 "url": {
                     "type": "string",
                     "title": "URL",
-                    "description": "Initial URL for the crawl.",
-                    "pattern": r"^https?:\/\/[^:\/\s]+(:\d{1,5})?(\/[^\s]*)*(#[^\s]*)?$",
+                    "description": (
+                        "Initial URL for the crawl. Enter the full URL including http(s), "
+                        "you can copy and paste it from your browser. Example: https://toscrape.com/"
+                    ),
+                    "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
                 },
             },
             "required": ["url"],
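
A quick illustrative check of the cleaned-up pattern above; this snippet is not part of the commit, and the example URLs are assumptions:

    import re

    # Pattern from the diff above; only redundant backslash escapes were
    # removed, so the regex semantics are unchanged.
    pattern = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$"

    assert re.match(pattern, "https://toscrape.com/")  # example from the new description
    assert re.match(pattern, "http://example.com:8080/a?b=1")  # port, path and query
    assert not re.match(pattern, "toscrape.com")  # the http(s) scheme is required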
195 changes: 115 additions & 80 deletions tests/test_middlewares.py
@@ -3,6 +3,7 @@
 from scrapy import Spider
 from scrapy.http import Request, Response
 from scrapy.statscollectors import StatsCollector
+from scrapy.utils.misc import create_instance
 from scrapy.utils.test import get_crawler
 
 from zyte_spider_templates.middlewares import (
@@ -11,14 +12,24 @@
 )
 
 
+def get_fingerprinter(crawler):
+    return lambda request: crawler.request_fingerprinter.fingerprint(request).hex()
+
+
 @freeze_time("2023-10-10 20:09:29")
 def test_crawling_logs_middleware_no_requests():
-    middleware = CrawlingLogsMiddleware()
+    crawler = get_crawler()
+    middleware = create_instance(
+        CrawlingLogsMiddleware, settings=crawler.settings, crawler=crawler
+    )
 
     url = "https://example.com"
     request = Request(url)
     response = Response(url=url, request=request)
 
+    request_fingerprint = get_fingerprinter(crawler)
+    fingerprint = request_fingerprint(request)
+
     def results_gen():
         return

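For context on the helper above: Scrapy's request fingerprinter returns raw bytes, and .hex() yields the hex digests embedded in the crawl-log JSON. A minimal sketch mirroring the calls used in these tests (not part of the commit):

    from scrapy.http import Request
    from scrapy.utils.test import get_crawler

    crawler = get_crawler()

    def fingerprint(request):
        # The fingerprinter returns bytes; .hex() gives the string form
        # that ends up in the crawl-log JSON.
        return crawler.request_fingerprinter.fingerprint(request).hex()

    # Prints a stable hex digest, e.g. a 40-character SHA-1 string like the
    # hardcoded values this commit removes from the assertions.
    print(fingerprint(Request("https://example.com")))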
@@ -38,7 +49,7 @@ def results_gen():
         ' "current": {\n'
         ' "url": "https://example.com",\n'
         ' "request_url": "https://example.com",\n'
-        ' "request_fingerprint": "6d748741a927b10454c83ac285b002cd239964ea",\n'
+        f' "request_fingerprint": "{fingerprint}",\n'
         ' "page_type": null,\n'
         ' "probability": null\n'
         " },\n"
@@ -56,88 +67,98 @@
 
 @freeze_time("2023-10-10 20:09:29")
 def test_crawling_logs_middleware():
-    middleware = CrawlingLogsMiddleware()
+    crawler = get_crawler()
+    middleware = create_instance(
+        CrawlingLogsMiddleware, settings=crawler.settings, crawler=crawler
+    )
 
     url = "https://example.com"
     request = Request(url)
     response = Response(url=url, request=request)
 
-    def results_gen():
-        # product
-        yield Request(
-            "https://example.com/tech/products?id=1",
-            priority=199,
-            meta={
-                "crawling_logs": {
-                    "name": "Product ID 1",
-                    "probability": 0.9951,
-                    "page_type": "product",
-                },
-            },
-        )
-
-        # nextPage
-        yield Request(
-            "https://example.com?page=2",
-            priority=100,
-            meta={
-                "crawling_logs": {
-                    "name": "Category Page 2",
-                    "probability": 0.9712,
-                    "page_type": "nextPage",
-                },
-            },
-        )
-
-        # subCategories
-        yield Request(
-            "https://example.com/tech/products/monitors",
-            priority=98,
-            meta={
-                "crawling_logs": {
-                    "name": "Monitors Subcategory",
-                    "probability": 0.9817,
-                    "page_type": "subCategories",
-                },
-            },
-        )
-
-        # productNavigation
-        yield Request(
-            "https://example.com/books/products",
-            priority=91,
-            meta={
-                "crawling_logs": {
-                    "name": "Books Category",
-                    "probability": 0.9136,
-                    "page_type": "productNavigation",
-                },
-            },
-        )
-
-        # productNavigation-heuristics
-        yield Request(
-            "https://example.com/some-other-page",
-            priority=10,
-            meta={
-                "crawling_logs": {
-                    "name": "Some Other Page",
-                    "probability": 0.1,
-                    "page_type": "productNavigation-heuristics",
-                },
-            },
-        )
-
-        # unknown
-        yield Request(
-            "https://example.com/other-unknown",
-            meta={
-                "crawling_logs": {
-                    "name": "Unknown Page",
-                    "page_type": "some other page_type",
-                },
-            },
-        )
+    product_request = Request(
+        "https://example.com/tech/products?id=1",
+        priority=199,
+        meta={
+            "crawling_logs": {
+                "name": "Product ID 1",
+                "probability": 0.9951,
+                "page_type": "product",
+            },
+        },
+    )
+
+    next_page_request = Request(
+        "https://example.com?page=2",
+        priority=100,
+        meta={
+            "crawling_logs": {
+                "name": "Category Page 2",
+                "probability": 0.9712,
+                "page_type": "nextPage",
+            },
+        },
+    )
+
+    subcategory_request = Request(
+        "https://example.com/tech/products/monitors",
+        priority=98,
+        meta={
+            "crawling_logs": {
+                "name": "Monitors Subcategory",
+                "probability": 0.9817,
+                "page_type": "subCategories",
+            },
+        },
+    )
+
+    product_navigation_request = Request(
+        "https://example.com/books/products",
+        priority=91,
+        meta={
+            "crawling_logs": {
+                "name": "Books Category",
+                "probability": 0.9136,
+                "page_type": "productNavigation",
+            },
+        },
+    )
+
+    product_navigation_heuristics_request = Request(
+        "https://example.com/some-other-page",
+        priority=10,
+        meta={
+            "crawling_logs": {
+                "name": "Some Other Page",
+                "probability": 0.1,
+                "page_type": "productNavigation-heuristics",
+            },
+        },
+    )
+
+    unknown_request = Request(
+        "https://example.com/other-unknown",
+        meta={
+            "crawling_logs": {
+                "name": "Unknown Page",
+                "page_type": "some other page_type",
+            },
+        },
+    )
+
+    request_fingerprint = get_fingerprinter(crawler)
+    fingerprint = request_fingerprint(request)
+    product_request_fp = request_fingerprint(product_request)
+    next_page_request_fp = request_fingerprint(next_page_request)
+    subcategory_request_fp = request_fingerprint(subcategory_request)
+    product_navigation_request_fp = request_fingerprint(product_navigation_request)
+    product_navigation_heuristics_request_fp = request_fingerprint(
+        product_navigation_heuristics_request
+    )
+    unknown_request_fp = request_fingerprint(unknown_request)
+
+    def results_gen():
+        yield product_request
+        yield next_page_request
+        yield subcategory_request
+        yield product_navigation_request
+        yield product_navigation_heuristics_request
+        yield unknown_request
 
     crawl_logs = middleware.crawl_logs(response, results_gen())
     assert crawl_logs == (
@@ -155,7 +176,7 @@ def results_gen():
         ' "current": {\n'
         ' "url": "https://example.com",\n'
         ' "request_url": "https://example.com",\n'
-        ' "request_fingerprint": "6d748741a927b10454c83ac285b002cd239964ea",\n'
+        f' "request_fingerprint": "{fingerprint}",\n'
         ' "page_type": null,\n'
         ' "probability": null\n'
         " },\n"
@@ -167,7 +188,7 @@ def results_gen():
         ' "page_type": "product",\n'
         ' "request_url": "https://example.com/tech/products?id=1",\n'
         ' "request_priority": 199,\n'
-        ' "request_fingerprint": "3ae14329c7fd5796ab543d7b02cdb7c7c2af3895"\n'
+        f' "request_fingerprint": "{product_request_fp}"\n'
         " }\n"
         " ],\n"
         ' "nextPage": [\n'
@@ -177,7 +198,7 @@ def results_gen():
         ' "page_type": "nextPage",\n'
         ' "request_url": "https://example.com?page=2",\n'
         ' "request_priority": 100,\n'
-        ' "request_fingerprint": "cf9e7c91564b16c204cdfa8fe3b4d7cb49375a2a"\n'
+        f' "request_fingerprint": "{next_page_request_fp}"\n'
         " }\n"
         " ],\n"
         ' "subCategories": [\n'
@@ -187,7 +208,7 @@ def results_gen():
         ' "page_type": "subCategories",\n'
         ' "request_url": "https://example.com/tech/products/monitors",\n'
         ' "request_priority": 98,\n'
-        ' "request_fingerprint": "107253243fb9bc9c679808c6c5d80bde5ae7ffe0"\n'
+        f' "request_fingerprint": "{subcategory_request_fp}"\n'
         " }\n"
         " ],\n"
         ' "productNavigation": [\n'
@@ -197,7 +218,7 @@ def results_gen():
         ' "page_type": "productNavigation",\n'
         ' "request_url": "https://example.com/books/products",\n'
         ' "request_priority": 91,\n'
-        ' "request_fingerprint": "e672605f85de9b9fe76e55463e5bd8ca66ae1ee2"\n'
+        f' "request_fingerprint": "{product_navigation_request_fp}"\n'
         " }\n"
         " ],\n"
         ' "productNavigation-heuristics": [\n'
@@ -207,7 +228,7 @@ def results_gen():
         ' "page_type": "productNavigation-heuristics",\n'
         ' "request_url": "https://example.com/some-other-page",\n'
         ' "request_priority": 10,\n'
-        ' "request_fingerprint": "a04e46e1d9887a9f397d97c40db63a7ce3c3f958"\n'
+        f' "request_fingerprint": "{product_navigation_heuristics_request_fp}"\n'
         " }\n"
         " ],\n"
         ' "unknown": [\n'
@@ -216,14 +237,28 @@ def results_gen():
         ' "page_type": "some other page_type",\n'
         ' "request_url": "https://example.com/other-unknown",\n'
         ' "request_priority": 0,\n'
-        ' "request_fingerprint": "61fb82880551b45981b0a1cc52eb802166b673ed"\n'
+        f' "request_fingerprint": "{unknown_request_fp}"\n'
         " }\n"
         " ]\n"
         " }\n"
         "}"
     )
 
 
+def test_crawling_logs_middleware_deprecated_subclassing():
+    class CustomCrawlingLogsMiddleware(CrawlingLogsMiddleware):
+        def __init__(self):
+            pass
+
+    crawler = get_crawler()
+    with pytest.warns(DeprecationWarning, match="must now accept a crawler parameter"):
+        middleware = create_instance(
+            CustomCrawlingLogsMiddleware, settings=crawler.settings, crawler=crawler
+        )
+    assert middleware._crawler == crawler
+    assert hasattr(middleware, "_fingerprint")
+
+
 @pytest.mark.parametrize(
     "req,allowed",
     (
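
The new deprecation test above pins down a compatibility contract: subclasses with an old-style __init__ that takes no crawler must still instantiate, emit a DeprecationWarning, and end up with _crawler and _fingerprint set. One hypothetical shape for the middleware side of that contract (an illustrative assumption, not the actual zyte_spider_templates code):

    import warnings

    class CrawlingLogsMiddleware:
        @classmethod
        def from_crawler(cls, crawler):
            try:
                return cls(crawler)
            except TypeError:
                # Old-style subclass: __init__ does not accept a crawler.
                warnings.warn(
                    "Subclasses of CrawlingLogsMiddleware must now accept a "
                    "crawler parameter in their __init__ method.",
                    DeprecationWarning,
                )
                middleware = cls()
                middleware._set_crawler(crawler)
                return middleware

        def __init__(self, crawler):
            self._set_crawler(crawler)

        def _set_crawler(self, crawler):
            self._crawler = crawler
            # Same idea as the get_fingerprinter() helper in the tests above.
            self._fingerprint = (
                lambda request: crawler.request_fingerprinter.fingerprint(request).hex()
            )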
13 changes: 12 additions & 1 deletion tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py38,py39,py310,py311,py312,mypy,linters,twine
+envlist = min,py38,py39,py310,py311,py312,mypy,linters,twine
 
 [testenv]
 deps =
@@ -16,6 +16,17 @@ commands =
     --reactor=asyncio \
     {posargs:zyte_spider_templates tests}
 
+[testenv:min]
+basepython = python3.8
+deps =
+    {[testenv]deps}
+    pydantic==2
+    scrapy==2.11.0
+    scrapy-poet==0.20.1
+    scrapy-spider-metadata==0.1.2
+    scrapy-zyte-api[provider]==0.15.0
+    zyte-common-items==0.13.0
+
 [testenv:mypy]
 deps =
     mypy==1.6.1
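
The new environment can be run locally with "tox -e min"; in CI it is presumably selected through the "toxenv: min" matrix entry added to the GitHub Actions workflow above.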