[434] Allow to filter jobs in ZyteJobsComparisonMonitor by close_reason #440

Merged: 13 commits, Jul 15, 2024
1 change: 1 addition & 0 deletions spidermon/contrib/scrapy/monitors/__init__.py
@@ -27,6 +27,7 @@
SPIDERMON_JOBS_COMPARISON_TAGS,
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
SPIDERMON_ITEM_COUNT_INCREASE,
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
)
from .suites import (
SpiderCloseMonitorSuite,
17 changes: 15 additions & 2 deletions spidermon/contrib/scrapy/monitors/monitors.py
@@ -21,6 +21,7 @@
SPIDERMON_JOBS_COMPARISON = "SPIDERMON_JOBS_COMPARISON"
SPIDERMON_JOBS_COMPARISON_STATES = "SPIDERMON_JOBS_COMPARISON_STATES"
SPIDERMON_JOBS_COMPARISON_TAGS = "SPIDERMON_JOBS_COMPARISON_TAGS"
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = "SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS"
SPIDERMON_JOBS_COMPARISON_THRESHOLD = "SPIDERMON_JOBS_COMPARISON_THRESHOLD"
SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE"

@@ -528,6 +529,11 @@ class ZyteJobsComparisonMonitor(BaseStatMonitor):
You can also filter which jobs to compare based on their tags using the
``SPIDERMON_JOBS_COMPARISON_TAGS`` setting. Among the defined tags we consider only those
that are also present in the current job.

You can also filter which jobs to compare based on their close reason using the
``SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS`` setting. The default value is ``()``,
which doesn't filter out any job based on close_reason. To consider only successfully
finished jobs, use ``("finished",)`` instead.
"""

stat_name = "item_scraped_count"
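
For context, the new setting sits alongside the existing job-comparison settings. A minimal ``settings.py`` sketch with illustrative values; only the setting names come from this PR and the existing monitor, the values are assumptions:

```python
# settings.py -- illustrative values; only the setting names are real
SPIDERMON_ENABLED = True
SPIDERMON_JOBS_COMPARISON = 5  # compare against the last 5 jobs
SPIDERMON_JOBS_COMPARISON_THRESHOLD = 0.9  # expect >= 90% of previous item counts
SPIDERMON_JOBS_COMPARISON_TAGS = ["production"]  # only jobs sharing this tag
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = ("finished",)  # new in this PR
```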
@@ -556,6 +562,9 @@ def run(self, result):

def _get_jobs(self, states, number_of_jobs):
tags = self._get_tags_to_filter()
close_reasons = self.crawler.settings.getlist(
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ()
)

total_jobs = []
start = 0
@@ -571,9 +580,13 @@ def _get_jobs(self, states, number_of_jobs):
count=count,
has_tag=tags or None,
)
total_jobs.extend(current_jobs)

if len(current_jobs) < MAX_API_COUNT or len(total_jobs) == number_of_jobs:
for job in current_jobs:
Member:
thought: This makes sense since it seems we can't filter by close_reason in client.spider.jobs.list().

I wonder if it makes sense to retrieve MAX_API_COUNT jobs each time from the API and append to total_jobs the jobs that meet the criteria until we reach number_of_jobs. This should result in fewer API calls, but those calls will be larger. What do you think, @VMRuiz?

Collaborator:
If the default close reason is finished, I would keep it as it is: the most common scenario is that you fetch the last few jobs, and those will have the finished status. Even when a second call is needed, due to a few unexpected failed jobs in between, it could still be faster for lower volumes than making one very big call.

On the contrary, if uncommon close reasons like timeout are expected, then it makes sense to fetch as many jobs as possible per call, as matching jobs will be few or even none.
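
Sketched out, the suggested alternative would always request ``MAX_API_COUNT`` jobs per call and filter client-side until ``number_of_jobs`` matching jobs are collected. This is only an illustration of the idea under discussion, not code from the PR:

```python
def _get_jobs_fetch_max(client, states, tags, close_reasons, number_of_jobs):
    # Always request the maximum page size to minimize the number of API calls,
    # then keep only jobs whose close_reason matches the configured filter.
    total_jobs = []
    start = 0
    while True:
        current_jobs = client.spider.jobs.list(
            start=start, state=states, count=MAX_API_COUNT, has_tag=tags or None
        )
        for job in current_jobs:
            if close_reasons and job.get("close_reason") not in close_reasons:
                continue
            total_jobs.append(job)
            if len(total_jobs) == number_of_jobs:
                return total_jobs
        if len(current_jobs) < MAX_API_COUNT:
            return total_jobs  # ran out of jobs before reaching the target
        start += MAX_API_COUNT
```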

if close_reasons and job.get("close_reason") not in close_reasons:
continue
total_jobs.append(job)

if len(current_jobs) < MAX_API_COUNT or len(total_jobs) >= number_of_jobs:
# Stop paginating if a page returns fewer than MAX_API_COUNT (1000) results
# or the target number of jobs was reached - no more pagination required
break
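
Stitched together, the merged pagination loop reads roughly as below. This is a readability sketch reconstructed from the hunks above and the tests; the ``Client`` construction and the exact ``count``/``start`` bookkeeping are assumptions:

```python
def _get_jobs(self, states, number_of_jobs):
    tags = self._get_tags_to_filter()
    close_reasons = self.crawler.settings.getlist(
        SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ()
    )
    total_jobs = []
    start = 0
    client = Client(self.crawler.settings)  # assumed setup
    while True:
        # Ask for the remainder still needed, capped at the API page limit.
        count = min(number_of_jobs - len(total_jobs), MAX_API_COUNT)
        current_jobs = client.spider.jobs.list(
            start=start, state=states, count=count, has_tag=tags or None
        )
        for job in current_jobs:
            if close_reasons and job.get("close_reason") not in close_reasons:
                continue  # drop jobs with a non-matching close_reason
            total_jobs.append(job)
        if len(current_jobs) < MAX_API_COUNT or len(total_jobs) >= number_of_jobs:
            # Short page (no more results) or target reached.
            break
        start += len(current_jobs)
    return total_jobs
```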
77 changes: 77 additions & 0 deletions tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py
@@ -8,6 +8,7 @@
SPIDERMON_JOBS_COMPARISON_STATES,
SPIDERMON_JOBS_COMPARISON_TAGS,
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
ZyteJobsComparisonMonitor,
monitors,
)
@@ -20,6 +21,17 @@ def mock_jobs(previous_counts):
return Mock(return_value=[dict(items=c) for c in previous_counts])


@pytest.fixture
def mock_jobs_with_close_reason(previous_job_objs, close_reasons):
return Mock(
return_value=[
dict(items=j["items"], close_reason=j["close_reason"])
for j in previous_job_objs
if j["close_reason"] in close_reasons
]
)


@pytest.fixture
def mock_suite(mock_jobs, monkeypatch):
monkeypatch.setattr(ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs)
@@ -30,6 +42,34 @@ def get_paginated_jobs(**kwargs):
return [Mock() for _ in range(kwargs["count"])]


def get_paginated_jobs_with_finished_close_reason(**kwargs):
objs = []
for _ in range(kwargs["count"]):
obj = Mock()
obj.get.return_value = "finished"
objs.append(obj)

return objs


def get_paginated_jobs_with_cancel_close_reason(**kwargs):
objs = []
for _ in range(kwargs["count"]):
obj = Mock()
obj.get.return_value = "cancel"
objs.append(obj)

return objs


@pytest.fixture
def mock_suite_with_close_reason(mock_jobs_with_close_reason, monkeypatch):
monkeypatch.setattr(
ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs_with_close_reason
)
return MonitorSuite(monitors=[ZyteJobsComparisonMonitor])


@pytest.fixture
def mock_suite_and_zyte_client(
monkeypatch,
@@ -119,6 +159,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Return exact number of jobs
@@ -134,6 +175,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
output = [Mock(), Mock()]
mock_client.spider.jobs.list = Mock(return_value=output)

@@ -149,13 +191,48 @@ def test_jobs_comparison_monitor_get_jobs():
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Jobs bigger than 1000
jobs = monitor._get_jobs(states=None, number_of_jobs=2500)
assert len(jobs) == 2500
assert mock_client.spider.jobs.list.call_count == 3
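
The ``call_count == 3`` expectation follows from the pagination arithmetic: each call requests ``min(remaining, MAX_API_COUNT)`` jobs, so 2500 splits into 1000 + 1000 + 500. A tiny standalone check of that arithmetic (hypothetical helper, for illustration only):

```python
MAX_API_COUNT = 1000  # API page-size limit used by the monitor

def expected_call_sizes(number_of_jobs):
    # Mirrors count = min(remaining, MAX_API_COUNT) from _get_jobs.
    sizes, fetched = [], 0
    while fetched < number_of_jobs:
        count = min(number_of_jobs - fetched, MAX_API_COUNT)
        sizes.append(count)
        fetched += count
    return sizes

assert expected_call_sizes(2500) == [1000, 1000, 500]  # hence three API calls
```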

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = ["finished"]
mock_client.spider.jobs.list = Mock(
side_effect=get_paginated_jobs_with_finished_close_reason
)

# Return exact number of jobs
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 50

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = ["finished"]
mock_client.spider.jobs.list = Mock(
side_effect=get_paginated_jobs_with_cancel_close_reason
)

# Return no jobs, as all will be filtered out due to close reason
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 0


@pytest.mark.parametrize(
["item_count", "previous_counts", "threshold", "should_raise"],