From ee6aa4fb06d2f6c747d6267a55b3b0a856b88319 Mon Sep 17 00:00:00 2001 From: Muhammad Shafiq Date: Thu, 4 Apr 2024 13:50:43 +0500 Subject: [PATCH 1/8] added filter based on close reason --- spidermon/contrib/scrapy/monitors/monitors.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/spidermon/contrib/scrapy/monitors/monitors.py b/spidermon/contrib/scrapy/monitors/monitors.py index b677803c..e01610fc 100644 --- a/spidermon/contrib/scrapy/monitors/monitors.py +++ b/spidermon/contrib/scrapy/monitors/monitors.py @@ -21,6 +21,7 @@ SPIDERMON_JOBS_COMPARISON = "SPIDERMON_JOBS_COMPARISON" SPIDERMON_JOBS_COMPARISON_STATES = "SPIDERMON_JOBS_COMPARISON_STATES" SPIDERMON_JOBS_COMPARISON_TAGS = "SPIDERMON_JOBS_COMPARISON_TAGS" +SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = "SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS" SPIDERMON_JOBS_COMPARISON_THRESHOLD = "SPIDERMON_JOBS_COMPARISON_THRESHOLD" SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE" @@ -523,6 +524,10 @@ class ZyteJobsComparisonMonitor(BaseStatMonitor): You can also filter which jobs to compare based on their tags using the ``SPIDERMON_JOBS_COMPARISON_TAGS`` setting. Among the defined tags we consider only those that are also present in the current job. + + You can also filter which jobs to compare based on their close reason using the + ``SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS`` setting. The default value is ``()``, + which doesn't filter any job based on close_reason. """ stat_name = "item_scraped_count" @@ -551,6 +556,9 @@ def run(self, result): def _get_jobs(self, states, number_of_jobs): tags = self._get_tags_to_filter() + close_reason = self.crawler.settings.getlist( + SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, () + ) jobs = [] start = 0 @@ -563,7 +571,13 @@ def _get_jobs(self, states, number_of_jobs): filters=dict(has_tag=tags) if tags else None, ) while _jobs: - jobs.extend(_jobs) + if close_reason: + for job in _jobs: + if job.get("close_reason") in close_reason: + jobs.append(job) + else: + jobs.extend(_jobs) + start += 1000 _jobs = client.spider.jobs.list( start=start, From 12b4862a550a0309e495fe3112ce208ad644f0fb Mon Sep 17 00:00:00 2001 From: Muhammad Shafiq Date: Thu, 18 Apr 2024 16:56:55 +0500 Subject: [PATCH 2/8] handled pr change requests --- spidermon/contrib/scrapy/monitors/monitors.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/spidermon/contrib/scrapy/monitors/monitors.py b/spidermon/contrib/scrapy/monitors/monitors.py index e01610fc..218c6050 100644 --- a/spidermon/contrib/scrapy/monitors/monitors.py +++ b/spidermon/contrib/scrapy/monitors/monitors.py @@ -556,7 +556,7 @@ def run(self, result): def _get_jobs(self, states, number_of_jobs): tags = self._get_tags_to_filter() - close_reason = self.crawler.settings.getlist( + close_reasons = self.crawler.settings.getlist( SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, () ) @@ -571,12 +571,10 @@ def _get_jobs(self, states, number_of_jobs): filters=dict(has_tag=tags) if tags else None, ) while _jobs: - if close_reason: - for job in _jobs: - if job.get("close_reason") in close_reason: - jobs.append(job) - else: - jobs.extend(_jobs) + for job in _jobs: + if close_reasons and job.get("close_reason") not in close_reasons: + continue + jobs.append(job) start += 1000 _jobs = client.spider.jobs.list( From 74e89e55473135a4a18f927dab815bef39087aa5 Mon Sep 17 00:00:00 2001 From: Muhammad Shafiq Date: Thu, 2 May 2024 13:28:31 +0500 Subject: [PATCH 3/8] added test cases --- spidermon/contrib/scrapy/monitors/__init__.py | 1 + 
.../monitors/test_jobs_comparison_monitor.py | 69 +++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/spidermon/contrib/scrapy/monitors/__init__.py b/spidermon/contrib/scrapy/monitors/__init__.py index 91a3247c..cdbc1c58 100644 --- a/spidermon/contrib/scrapy/monitors/__init__.py +++ b/spidermon/contrib/scrapy/monitors/__init__.py @@ -27,6 +27,7 @@ SPIDERMON_JOBS_COMPARISON_TAGS, SPIDERMON_JOBS_COMPARISON_THRESHOLD, SPIDERMON_ITEM_COUNT_INCREASE, + SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ) from .suites import ( SpiderCloseMonitorSuite, diff --git a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py index b681d8f4..093863ac 100644 --- a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py +++ b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py @@ -7,6 +7,7 @@ SPIDERMON_JOBS_COMPARISON_STATES, SPIDERMON_JOBS_COMPARISON_TAGS, SPIDERMON_JOBS_COMPARISON_THRESHOLD, + SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ZyteJobsComparisonMonitor, monitors, ) @@ -18,12 +19,31 @@ def mock_jobs(previous_counts): return Mock(return_value=[dict(items=c) for c in previous_counts]) +@pytest.fixture +def mock_jobs_with_close_reason(previous_job_objs, close_reasons): + return Mock( + return_value=[ + dict(items=j["items"], close_reason=j["close_reason"]) + for j in previous_job_objs + if j["close_reason"] in close_reasons + ] + ) + + @pytest.fixture def mock_suite(mock_jobs, monkeypatch): monkeypatch.setattr(ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs) return MonitorSuite(monitors=[ZyteJobsComparisonMonitor]) +@pytest.fixture +def mock_suite_with_close_reason(mock_jobs_with_close_reason, monkeypatch): + monkeypatch.setattr( + ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs_with_close_reason + ) + return MonitorSuite(monitors=[ZyteJobsComparisonMonitor]) + + @pytest.fixture def mock_suite_and_zyte_client( monkeypatch, @@ -153,3 +173,52 @@ def test_arguments_passed_to_zyte_client( ] mock_client.spider.jobs.list.assert_has_calls(calls) + + +@pytest.mark.parametrize( + ["item_count", "previous_job_objs", "threshold", "close_reasons", "should_raise"], + [ + ( + 100, + [ + {"items": 100, "close_reason": "finished"}, + {"items": 50, "close_reason": "cancled"}, + ], + 0.9, + ["finished"], + False, + ), + ( + 100, + [ + {"items": 20, "close_reason": "finished"}, + {"items": 500, "close_reason": "cancled"}, + {"items": 1500, "close_reason": "cancled"}, + ], + 0.9, + ["finished"], + False, + ), + (100, [{"items": 200, "close_reason": "finished"}], 0.9, ["finished"], True), + ], +) +def test_jobs_comparison_monitor_threshold2( + make_data, + mock_suite_with_close_reason, + item_count, + threshold, + close_reasons, + should_raise, +): + data = make_data( + { + SPIDERMON_JOBS_COMPARISON: 1, + SPIDERMON_JOBS_COMPARISON_THRESHOLD: threshold, + SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS: "finished", + } + ) + data["stats"]["item_scraped_count"] = item_count + runner = data.pop("runner") + runner.run(mock_suite_with_close_reason, **data) + + assert should_raise == bool(runner.result.monitor_results[0].error) From 6406295745d6a5d55fd0f3ae3fbd42f436237d63 Mon Sep 17 00:00:00 2001 From: Muhammad Shafiq Date: Thu, 2 May 2024 13:30:57 +0500 Subject: [PATCH 4/8] updated testcase name --- tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py 
b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py index 093863ac..c73c5734 100644 --- a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py +++ b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py @@ -202,7 +202,7 @@ def test_arguments_passed_to_zyte_client( (100, [{"items": 200, "close_reason": "finished"}], 0.9, ["finished"], True), ], ) -def test_jobs_comparison_monitor_threshold2( +def test_jobs_comparison_monitor_close_reason( make_data, mock_suite_with_close_reason, item_count, From 9df19493a14df207baccd61edbe0b38e890bb78d Mon Sep 17 00:00:00 2001 From: Muhammad Shafiq Date: Thu, 16 May 2024 16:41:01 +0500 Subject: [PATCH 5/8] fixed test cases --- tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py index 8fecfe7e..e6bfa510 100644 --- a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py +++ b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py @@ -139,6 +139,7 @@ def test_jobs_comparison_monitor_get_jobs(): monitor = TestZyteJobsComparisonMonitor() monitor._get_tags_to_filter = Mock(side_effect=lambda: None) monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = None mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs) # Return exact number of jobs @@ -154,6 +155,7 @@ def test_jobs_comparison_monitor_get_jobs(): monitor = TestZyteJobsComparisonMonitor() monitor._get_tags_to_filter = Mock(side_effect=lambda: None) monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = None output = [Mock(), Mock()] mock_client.spider.jobs.list = Mock(return_value=output) @@ -169,6 +171,7 @@ def test_jobs_comparison_monitor_get_jobs(): monitor = TestZyteJobsComparisonMonitor() monitor._get_tags_to_filter = Mock(side_effect=lambda: None) monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = None mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs) # Jobs bigger than 1000 From 2114ca40c28577bebbc58f60c90af030baa39c21 Mon Sep 17 00:00:00 2001 From: Muhammad Shafiq Date: Thu, 13 Jun 2024 11:38:00 +0500 Subject: [PATCH 6/8] added new unit test cases --- .../monitors/test_jobs_comparison_monitor.py | 99 ++++++++++--------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py index e6bfa510..6efd37fe 100644 --- a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py +++ b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py @@ -41,6 +41,24 @@ def mock_suite(mock_jobs, monkeypatch): def get_paginated_jobs(**kwargs): return [Mock() for _ in range(kwargs["count"])] +def get_paginated_jobs_with_finished_close_reason(**kwargs): + objs = [] + for _ in range(kwargs["count"]): + obj = Mock() + obj.get.return_value = "finished" + objs.append(obj) + + return objs + +def get_paginated_jobs_with_cancel_close_reason(**kwargs): + objs = [] + for _ in range(kwargs["count"]): + obj = Mock() + obj.get.return_value = "cancel" + objs.append(obj) + + return objs + @pytest.fixture def mock_suite_with_close_reason(mock_jobs_with_close_reason, monkeypatch): @@ -179,6 +197,38 @@ def test_jobs_comparison_monitor_get_jobs(): assert len(jobs) == 2500 assert mock_client.spider.jobs.list.call_count == 3 + mock_client = Mock() + with patch( + 
"spidermon.contrib.scrapy.monitors.monitors.Client" + ) as mock_client_class: + mock_client_class.return_value = mock_client + monitor = TestZyteJobsComparisonMonitor() + monitor._get_tags_to_filter = Mock(side_effect=lambda: None) + monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = ["finished"] + mock_client.spider.jobs.list = Mock( + side_effect=get_paginated_jobs_with_finished_close_reason) + + # Return exact number of jobs + jobs = monitor._get_jobs(states=None, number_of_jobs=50) + assert len(jobs) == 50 + + mock_client = Mock() + with patch( + "spidermon.contrib.scrapy.monitors.monitors.Client" + ) as mock_client_class: + mock_client_class.return_value = mock_client + monitor = TestZyteJobsComparisonMonitor() + monitor._get_tags_to_filter = Mock(side_effect=lambda: None) + monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = ["finished"] + mock_client.spider.jobs.list = Mock( + side_effect=get_paginated_jobs_with_cancel_close_reason) + + # Return no jobs as all will be filtered due to close reaseon + jobs = monitor._get_jobs(states=None, number_of_jobs=50) + assert len(jobs) == 0 + @pytest.mark.parametrize( ["item_count", "previous_counts", "threshold", "should_raise"], @@ -247,52 +297,3 @@ def test_arguments_passed_to_zyte_client( ] mock_client.spider.jobs.list.assert_has_calls(calls) - - -@pytest.mark.parametrize( - ["item_count", "previous_job_objs", "threshold", "close_reasons", "should_raise"], - [ - ( - 100, - [ - {"items": 100, "close_reason": "finished"}, - {"items": 50, "close_reason": "cancled"}, - ], - 0.9, - ["finished"], - False, - ), - ( - 100, - [ - {"items": 20, "close_reason": "finished"}, - {"items": 500, "close_reason": "cancled"}, - {"items": 1500, "close_reason": "cancled"}, - ], - 0.9, - ["finished"], - False, - ), - (100, [{"items": 200, "close_reason": "finished"}], 0.9, ["finished"], True), - ], -) -def test_jobs_comparison_monitor_close_reason( - make_data, - mock_suite_with_close_reason, - item_count, - threshold, - close_reasons, - should_raise, -): - data = make_data( - { - SPIDERMON_JOBS_COMPARISON: 1, - SPIDERMON_JOBS_COMPARISON_THRESHOLD: threshold, - SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS: "finished", - } - ) - data["stats"]["item_scraped_count"] = item_count - runner = data.pop("runner") - runner.run(mock_suite_with_close_reason, **data) - - assert should_raise == bool(runner.result.monitor_results[0].error) From 0ebe5f90b3b7bd7dfd47910da601916f1eaeb9b2 Mon Sep 17 00:00:00 2001 From: Muhammad Shafiq Date: Thu, 13 Jun 2024 11:40:09 +0500 Subject: [PATCH 7/8] linting --- .../scrapy/monitors/test_jobs_comparison_monitor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py index 6efd37fe..9d455949 100644 --- a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py +++ b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py @@ -41,6 +41,7 @@ def mock_suite(mock_jobs, monkeypatch): def get_paginated_jobs(**kwargs): return [Mock() for _ in range(kwargs["count"])] + def get_paginated_jobs_with_finished_close_reason(**kwargs): objs = [] for _ in range(kwargs["count"]): @@ -50,6 +51,7 @@ def get_paginated_jobs_with_finished_close_reason(**kwargs): return objs + def get_paginated_jobs_with_cancel_close_reason(**kwargs): objs = [] for _ in range(kwargs["count"]): @@ -207,7 +209,8 @@ def test_jobs_comparison_monitor_get_jobs(): monitor.data = Mock() 
        monitor.crawler.settings.getlist.return_value = ["finished"]
         mock_client.spider.jobs.list = Mock(
-            side_effect=get_paginated_jobs_with_finished_close_reason)
+            side_effect=get_paginated_jobs_with_finished_close_reason
+        )

         # Return exact number of jobs
         jobs = monitor._get_jobs(states=None, number_of_jobs=50)
         assert len(jobs) == 50

     mock_client = Mock()
     with patch(
-        "spidermon.contrib.scrapy.monitors.monitors.Client"
+        "spidermon.contrib.scrapy.monitors.monitors.Client"
     ) as mock_client_class:
         mock_client_class.return_value = mock_client
         monitor = TestZyteJobsComparisonMonitor()
         monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
         monitor.data = Mock()
         monitor.crawler.settings.getlist.return_value = ["finished"]
         mock_client.spider.jobs.list = Mock(
-            side_effect=get_paginated_jobs_with_cancel_close_reason)
+            side_effect=get_paginated_jobs_with_cancel_close_reason
+        )

         # Return no jobs as all will be filtered out due to close reason
         jobs = monitor._get_jobs(states=None, number_of_jobs=50)
         assert len(jobs) == 0

From 868566ff93b4b9a7fd02f98575e4b0232039fc61 Mon Sep 17 00:00:00 2001
From: Muhammad Shafiq
Date: Thu, 27 Jun 2024 17:11:46 +0500
Subject: [PATCH 8/8] updated doc string

---
 spidermon/contrib/scrapy/monitors/monitors.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spidermon/contrib/scrapy/monitors/monitors.py b/spidermon/contrib/scrapy/monitors/monitors.py
index c3f269ba..e7679227 100644
--- a/spidermon/contrib/scrapy/monitors/monitors.py
+++ b/spidermon/contrib/scrapy/monitors/monitors.py
@@ -532,7 +532,8 @@ class ZyteJobsComparisonMonitor(BaseStatMonitor):

     You can also filter which jobs to compare based on their close reason using the
     ``SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS`` setting. The default value is ``()``,
-    which doesn't filter any job based on close_reason.
+    which doesn't filter any job based on close_reason. To only consider successfully finished jobs,
+    use ``("finished",)`` instead.
     """

     stat_name = "item_scraped_count"
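
For reference, a minimal sketch of how a project might enable the new filter once this
series is applied. The ``SPIDERMON_JOBS_COMPARISON_*`` settings come from the patches
above; ``SPIDERMON_ENABLED`` and the concrete values are assumptions for a typical
Spidermon setup and may differ per project::

    # settings.py (illustrative sketch; values are assumptions, adjust per project)
    SPIDERMON_ENABLED = True  # assumed standard Spidermon toggle

    # Compare the current job's item count against recent jobs and flag the run
    # if it falls below the configured threshold of what those jobs produced.
    SPIDERMON_JOBS_COMPARISON = 5
    SPIDERMON_JOBS_COMPARISON_THRESHOLD = 0.9

    # New in this series: only previous jobs whose close_reason is listed here
    # are used as the comparison baseline; the default () applies no filtering.
    SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = ("finished",)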