From 1cc435229dd765fd0a4ff4502ea5c33e8ce9db9b Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Fri, 29 Sep 2023 21:49:51 +0400
Subject: [PATCH] scrapy-spider-metadata support. (#75)

* scrapy-spider-metadata support.

* Fixes.

* get_metadata_for_spider was renamed.

* Add tests for shub-image-info.

* Fix tests without scrapy-spider-metadata.

* Remove scrapy-spider-metadata from requirements.

* Cleanup.
---
Reviewer note: this copy of the patch was re-flowed and lightly edited by
hand (bare `except:` in tests/test_crawl.py narrowed to `except ImportError:`,
one-line docstrings added to the helpers in tests/utils.py, commit-message
typo fixed), so the blob ids on the `index` lines describe the original
commit rather than this text.

 sh_scrapy/commands/shub_image_info.py | 17 ++++++-
 tests/test_crawl.py                   | 66 ++++++++++++++++++++++++++-
 tests/utils.py                        | 39 ++++++++++++++++
 tox.ini                               |  2 +
 4 files changed, 121 insertions(+), 3 deletions(-)
 create mode 100644 tests/utils.py

diff --git a/sh_scrapy/commands/shub_image_info.py b/sh_scrapy/commands/shub_image_info.py
index 844cfb2..70bd3db 100644
--- a/sh_scrapy/commands/shub_image_info.py
+++ b/sh_scrapy/commands/shub_image_info.py
@@ -35,8 +35,23 @@ def add_options(self, parser):
     def run(self, args, opts):
         result = {
             'project_type': 'scrapy',
-            'spiders': sorted(self.crawler_process.spider_loader.list())
+            'spiders': sorted(self.crawler_process.spider_loader.list()),
         }
+        try:
+            from scrapy_spider_metadata import get_spider_metadata
+        except ImportError:
+            pass
+        else:
+            result['metadata'] = {}
+            for spider_name in result['spiders']:
+                spider_cls = self.crawler_process.spider_loader.load(spider_name)
+                metadata_dict = get_spider_metadata(spider_cls)
+                try:
+                    # make sure it's serializable
+                    json.dumps(metadata_dict)
+                except (TypeError, ValueError):
+                    continue
+                result['metadata'][spider_name] = metadata_dict
         if opts.debug:
             output = subprocess.check_output(
                 ['bash', '-c', self.IMAGE_INFO_CMD],
diff --git a/tests/test_crawl.py b/tests/test_crawl.py
index e382b52..329e4c6 100644
--- a/tests/test_crawl.py
+++ b/tests/test_crawl.py
@@ -3,9 +3,7 @@
 import json
 import mock
 import pytest
-import warnings
 from scrapy.settings import Settings
-from scrapy.exceptions import ScrapyDeprecationWarning
 
 import sh_scrapy.crawl
 from sh_scrapy.crawl import _fatalerror
@@ -18,6 +16,14 @@
 from sh_scrapy.crawl import list_spiders
 from sh_scrapy.crawl import main
 from sh_scrapy.log import HubstorageLogHandler
+from tests.utils import create_project, call_command
+
+
+try:
+    from scrapy_spider_metadata import get_spider_metadata
+    SPIDER_METADATA_AVAILABLE = True
+except ImportError:
+    SPIDER_METADATA_AVAILABLE = False
 
 
 @mock.patch.dict(os.environ, {'HWORKER_SENTRY_DSN': 'hw-sentry-dsn',
@@ -281,3 +287,59 @@ def test_main(mocked_launch, pipe_writer):
     # This ensures that pipe is writable even if main program is fininshed -
     # e.g. for threads that are not closed yet.
     assert not pipe_writer.close.called
+
+
+def test_image_info(tmp_path):
+    project_dir = create_project(tmp_path)
+    out, err = call_command(project_dir, "shub-image-info")
+    # can't be asserted as it contains a SHScrapyDeprecationWarning
+    # assert err == ""
+    data = json.loads(out)
+    expected = {
+        "project_type": "scrapy",
+        "spiders": ["myspider"],
+        "metadata": {"myspider": {}},
+    }
+    if not SPIDER_METADATA_AVAILABLE:
+        del expected["metadata"]
+    assert data == expected
+
+
+def test_image_info_metadata(tmp_path):
+    project_dir = create_project(tmp_path, spider_text="""
+from scrapy import Spider
+
+class MySpider(Spider):
+    name = "myspider"
+    metadata = {"foo": 42}
+""")
+    out, _ = call_command(project_dir, "shub-image-info")
+    data = json.loads(out)
+    expected = {
+        "project_type": "scrapy",
+        "spiders": ["myspider"],
+        "metadata": {"myspider": {"foo": 42}},
+    }
+    if not SPIDER_METADATA_AVAILABLE:
+        del expected["metadata"]
+    assert data == expected
+
+
+def test_image_info_metadata_skip_broken(tmp_path):
+    project_dir = create_project(tmp_path, spider_text="""
+from scrapy import Spider
+
+class MySpider(Spider):
+    name = "myspider"
+    metadata = {"foo": Spider}
+""")
+    out, _ = call_command(project_dir, "shub-image-info")
+    data = json.loads(out)
+    expected = {
+        "project_type": "scrapy",
+        "spiders": ["myspider"],
+        "metadata": {},
+    }
+    if not SPIDER_METADATA_AVAILABLE:
+        del expected["metadata"]
+    assert data == expected
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..8e320dd
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,39 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Tuple, Optional, Union
+
+
+def call_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]:
+    """Run ``args`` in ``cwd``, assert a zero exit code, return (stdout, stderr)."""
+    result = subprocess.run(
+        args,
+        cwd=str(cwd),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        universal_newlines=True,
+    )
+    assert result.returncode == 0, result.stderr
+    return result.stdout, result.stderr
+
+
+def call_scrapy_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]:
+    """Run ``scrapy.cmdline`` with ``args`` in ``cwd`` using the current interpreter."""
+    args = (sys.executable, "-m", "scrapy.cmdline") + args
+    return call_command(cwd, *args)
+
+
+def create_project(topdir: Path, spider_text: Optional[str] = None) -> Path:
+    """Create Scrapy project "foo" in ``topdir`` with one spider file; return its dir."""
+    project_name = "foo"
+    cwd = topdir
+    call_scrapy_command(str(cwd), "startproject", project_name)
+    cwd /= project_name
+    (cwd / project_name / "spiders" / "spider.py").write_text(spider_text or """
+from scrapy import Spider
+
+class MySpider(Spider):
+    name = "myspider"
+""")
+    return cwd
diff --git a/tox.ini b/tox.ini
index 49682e8..c1b6641 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,5 +10,7 @@ deps =
     hubstorage
     packaging
     py36-scrapy16: Scrapy==1.6
+    scrapy-spider-metadata; python_version >= "3.8"
+
 commands =
     pytest --verbose --cov=sh_scrapy --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: sh_scrapy tests}