From bb04ff27a2df1b8d304aad8997e82bfd7dc99d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Busso?= Date: Sun, 24 Mar 2024 00:27:34 +1300 Subject: [PATCH] test new pipelines --- .gitignore | 1 + scrapework/config.py | 3 +- scrapework/pipelines.py | 4 +-- tests/test_pipelines.py | 70 +++++++---------------------------------- 4 files changed, 15 insertions(+), 63 deletions(-) diff --git a/.gitignore b/.gitignore index 47b2ec7..3872521 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ spidy.py +output.json diff --git a/scrapework/config.py b/scrapework/config.py index 92cdaea..d810974 100644 --- a/scrapework/config.py +++ b/scrapework/config.py @@ -11,7 +11,7 @@ class EnvConfig(BaseModel): @classmethod def create_config(cls): fields = {} - for field_name, field_value in cls.__fields__.items(): + for field_name, field_value in cls.model_fields.items(): if field_name in os.environ: fields[field_name] = os.environ[field_name] else: @@ -29,5 +29,4 @@ def create_config(cls): class PipelineConfig(BaseModel): base_url: str - filename: str diff --git a/scrapework/pipelines.py b/scrapework/pipelines.py index 429fd59..f28c18b 100644 --- a/scrapework/pipelines.py +++ b/scrapework/pipelines.py @@ -3,10 +3,10 @@ from typing import Any, Dict, Iterable, Union import boto3 -from pydantic import Field +from pydantic import BaseModel, Field -class Pipeline(ABC): +class Pipeline(ABC, BaseModel): @abstractmethod def process_items( self, diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index e5bb923..37c8a8d 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -1,46 +1,20 @@ import json -from unittest.mock import MagicMock, patch +from unittest.mock import patch from scrapework.config import PipelineConfig -from scrapework.spider import Spider - - -# Create a concrete subclass of Spider for testing purposes -class ConcreteSpider(Spider): - name = "concrete_spider" - - def parse(self): - pass - - -def test_process_items_with_file_backend(): - items = [{"name": "item1"}, {"name": "item2"}] - config = PipelineConfig( - backend=BackendType.FILE, - base_url="https://example.com", - s3_bucket="test-bucket", - filename="test.json", - ) - pipeline = ItemPipeline() - - with patch("builtins.open", MagicMock()) as mock_open: - pipeline.process_items(items, config) - - mock_open.assert_called_once_with("test.json", "w") +from scrapework.pipelines import JsonFilePipeline, S3Pipeline def test_process_items_with_s3_backend(): items = [{"name": "item1"}, {"name": "item2"}] config = PipelineConfig( - backend=BackendType.S3, base_url="https://example.com", - s3_bucket="my-bucket", filename="example.json", ) - pipeline = ItemPipeline() + pipeline = S3Pipeline(s3_bucket="my-bucket") with patch("boto3.client") as mock_s3_client: - pipeline.process_items(items, config) + pipeline.process_items(items, config.filename) mock_s3_client.assert_called_once_with("s3") mock_s3_client.return_value.put_object.assert_called_once_with( @@ -48,36 +22,14 @@ def test_process_items_with_s3_backend(): ) -def test_export_to_json(): +def test_process_items_with_json_file_backend(): items = [{"name": "item1"}, {"name": "item2"}] - config = PipelineConfig( - backend=BackendType.FILE, - base_url="https://example.com", - s3_bucket="my-bucket", - filename="example.json", - ) - pipeline = ItemPipeline() - - with patch("builtins.open", MagicMock()) as mock_open: - pipeline.export_to_json(items, config) - - mock_open.assert_called_once_with("example.json", "w") - + filename = "output.json" + pipeline = JsonFilePipeline() -def test_export_to_s3(): - items = [{"name": "item1"}, {"name": "item2"}] - config = PipelineConfig( - backend=BackendType.S3, - base_url="https://example.com", - s3_bucket="my-bucket", - filename="example.json", - ) - pipeline = ItemPipeline() + pipeline.process_items(items, filename) - with patch("boto3.client") as mock_s3_client: - pipeline.export_to_s3(items, config) + with open(filename, "r") as f: + data = json.load(f) - mock_s3_client.assert_called_once_with("s3") - mock_s3_client.return_value.put_object.assert_called_once_with( - Body=json.dumps(items), Bucket="my-bucket", Key="example.json" - ) + assert data == items