Skip to content

Commit 8cd7953

Browse files
[Issue-435] Job Argument filter for ZyteJobsComparisonMonitor (#461)
* added filter on job comparison monitor based on job arguments * added doc string * added test cases * linting * added new test cases * removed unwanted code * updated monitor to compare arguments dict not just the keys * added new test case * linting --------- Co-authored-by: Víctor Ruiz <victorruiz@zyte.com>
1 parent 19a11f7 commit 8cd7953

File tree

3 files changed

+172
-1
lines changed

3 files changed

+172
-1
lines changed

spidermon/contrib/scrapy/monitors/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
2929
SPIDERMON_ITEM_COUNT_INCREASE,
3030
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
31+
SPIDERMON_JOBS_COMPARISON_ARGUMENTS,
3132
)
3233
from .suites import (
3334
SpiderCloseMonitorSuite,

spidermon/contrib/scrapy/monitors/monitors.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
SPIDERMON_JOBS_COMPARISON_TAGS = "SPIDERMON_JOBS_COMPARISON_TAGS"
2424
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = "SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS"
2525
SPIDERMON_JOBS_COMPARISON_THRESHOLD = "SPIDERMON_JOBS_COMPARISON_THRESHOLD"
26+
SPIDERMON_JOBS_COMPARISON_ARGUMENTS = "SPIDERMON_JOBS_COMPARISON_ARGUMENTS"
27+
SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED = (
28+
"SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED"
29+
)
2630
SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE"
2731

2832

@@ -534,6 +538,13 @@ class ZyteJobsComparisonMonitor(BaseStatMonitor):
534538
``SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS`` setting. The default value is ``()``,
535539
which doesn't filter any job based on close_reason. To only consider successfully finished jobs,
536540
use ``("finished", )`` instead.
541+
542+
You can also filter which jobs to compare based on the job arguments using the
543+
``SPIDERMON_JOBS_COMPARISON_ARGUMENTS`` setting. It will filter any job based on spider_args.
544+
A job is processed only when its spider arguments exactly match the configured arguments dict.
545+
Example {"debug_url": "https://www.google.com"} or {"is_full_crawl": True}
546+
You can enable this filter by setting ``SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED`` to ``True`` in the settings;
otherwise, this filter will not be applied.
537548
"""
538549

539550
stat_name = "item_scraped_count"
@@ -565,6 +576,10 @@ def _get_jobs(self, states, number_of_jobs):
565576
close_reasons = self.crawler.settings.getlist(
566577
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ()
567578
)
579+
args = self._get_args_to_filter()
580+
args_enabled = self.crawler.settings.getbool(
581+
SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED, False
582+
)
568583

569584
total_jobs = []
570585
start = 0
@@ -584,6 +599,10 @@ def _get_jobs(self, states, number_of_jobs):
584599
for job in current_jobs:
585600
if close_reasons and job.get("close_reason") not in close_reasons:
586601
continue
602+
603+
if args_enabled and not self._has_desired_args(job, args):
604+
continue
605+
587606
total_jobs.append(job)
588607

589608
if len(current_jobs) < MAX_API_COUNT or len(total_jobs) >= number_of_jobs:
@@ -611,6 +630,30 @@ def _get_tags_to_filter(self):
611630
tags_to_filter = set(desired_tags) & set(current_tags)
612631
return list(sorted(tags_to_filter))
613632

633+
def _get_args_to_filter(self):
    """
    Return the desired job arguments to filter on, as a dict.

    Read from the ``SPIDERMON_JOBS_COMPARISON_ARGUMENTS`` setting; an
    empty dict is returned when the setting is missing or empty.
    """
    # Settings.getdict already returns {} when the setting is unset,
    # so no explicit empty-dict fallback branch is needed.
    return self.crawler.settings.getdict(SPIDERMON_JOBS_COMPARISON_ARGUMENTS)
644+
645+
def _has_desired_args(self, job, args):
646+
if not args and not job.get("spider_args"):
647+
return True
648+
elif not args and job.get("spider_args"):
649+
return False
650+
651+
job_args = job["spider_args"].keys()
652+
if not all(a in job_args for a in args):
653+
return False
654+
655+
return args == job["spider_args"]
656+
614657
def get_threshold(self):
615658
number_of_jobs = self.crawler.settings.getint(SPIDERMON_JOBS_COMPARISON)
616659

tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
SPIDERMON_JOBS_COMPARISON_STATES,
99
SPIDERMON_JOBS_COMPARISON_TAGS,
1010
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
11+
SPIDERMON_JOBS_COMPARISON_ARGUMENTS,
1112
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
1213
ZyteJobsComparisonMonitor,
1314
monitors,
@@ -39,7 +40,28 @@ def mock_suite(mock_jobs, monkeypatch):
3940

4041

4142
def get_paginated_jobs(**kwargs):
    # Produce ``count`` job payloads, each carrying an empty spider_args dict.
    return [{"spider_args": {}} for _ in range(kwargs["count"])]
47+
48+
49+
def get_paginated_jobs_with_one_args(**kwargs):
    # Produce ``count`` finished jobs, each with a single spider argument.
    return [
        {"spider_args": {"args1": True}, "close_reason": "finished"}
        for _ in range(kwargs["count"])
    ]
56+
57+
58+
def get_paginated_jobs_arg_finished(**kwargs):
    # Produce ``count`` finished jobs whose spider_args contain ``finished: True``.
    return [
        {"spider_args": {"finished": True}, "close_reason": "finished"}
        for _ in range(kwargs["count"])
    ]
4365

4466

4567
def get_paginated_jobs_with_finished_close_reason(**kwargs):
@@ -160,6 +182,7 @@ def test_jobs_comparison_monitor_get_jobs():
160182
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
161183
monitor.data = Mock()
162184
monitor.crawler.settings.getlist.return_value = None
185+
monitor.crawler.settings.getbool.return_value = False
163186
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)
164187

165188
# Return exact number of jobs
@@ -176,6 +199,7 @@ def test_jobs_comparison_monitor_get_jobs():
176199
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
177200
monitor.data = Mock()
178201
monitor.crawler.settings.getlist.return_value = None
202+
monitor.crawler.settings.getbool.return_value = False
179203
output = [Mock(), Mock()]
180204
mock_client.spider.jobs.list = Mock(return_value=output)
181205

@@ -192,6 +216,7 @@ def test_jobs_comparison_monitor_get_jobs():
192216
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
193217
monitor.data = Mock()
194218
monitor.crawler.settings.getlist.return_value = None
219+
monitor.crawler.settings.getbool.return_value = False
195220
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)
196221

197222
# Jobs bigger than 1000
@@ -208,6 +233,7 @@ def test_jobs_comparison_monitor_get_jobs():
208233
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
209234
monitor.data = Mock()
210235
monitor.crawler.settings.getlist.return_value = ["finished"]
236+
monitor.crawler.settings.getbool.return_value = False
211237
mock_client.spider.jobs.list = Mock(
212238
side_effect=get_paginated_jobs_with_finished_close_reason
213239
)
@@ -225,6 +251,7 @@ def test_jobs_comparison_monitor_get_jobs():
225251
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
226252
monitor.data = Mock()
227253
monitor.crawler.settings.getlist.return_value = ["finished"]
254+
monitor.crawler.settings.getbool.return_value = False
228255
mock_client.spider.jobs.list = Mock(
229256
side_effect=get_paginated_jobs_with_cancel_close_reason
230257
)
@@ -233,6 +260,106 @@ def test_jobs_comparison_monitor_get_jobs():
233260
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
234261
assert len(jobs) == 0
235262

263+
mock_client = Mock()
264+
with patch(
265+
"spidermon.contrib.scrapy.monitors.monitors.Client"
266+
) as mock_client_class:
267+
mock_client_class.return_value = mock_client
268+
monitor = TestZyteJobsComparisonMonitor()
269+
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
270+
monitor.data = Mock()
271+
monitor.crawler.settings.getdict.return_value = {}
272+
monitor.crawler.settings.getlist.return_value = None
273+
monitor.crawler.settings.getbool.return_value = True
274+
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)
275+
276+
# Return exact number of jobs
277+
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
278+
assert len(jobs) == 50
279+
mock_client.spider.jobs.list.assert_called_once()
280+
281+
mock_client = Mock()
282+
with patch(
283+
"spidermon.contrib.scrapy.monitors.monitors.Client"
284+
) as mock_client_class:
285+
mock_client_class.return_value = mock_client
286+
monitor = TestZyteJobsComparisonMonitor()
287+
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
288+
monitor.data = Mock()
289+
monitor.crawler.settings.getdict.return_value = {"finished": True}
290+
monitor.crawler.settings.getlist.return_value = ["finished"]
291+
monitor.crawler.settings.getbool.return_value = True
292+
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)
293+
294+
# Return exact number of jobs
295+
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
296+
assert len(jobs) == 50
297+
mock_client.spider.jobs.list.assert_called_once()
298+
299+
mock_client = Mock()
300+
with patch(
301+
"spidermon.contrib.scrapy.monitors.monitors.Client"
302+
) as mock_client_class:
303+
mock_client_class.return_value = mock_client
304+
monitor = TestZyteJobsComparisonMonitor()
305+
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
306+
monitor.data = Mock()
307+
monitor.crawler.settings.getdict.return_value = {"finished": False}
308+
monitor.crawler.settings.getlist.return_value = ["finished"]
309+
monitor.crawler.settings.getbool.return_value = True
310+
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)
311+
312+
# Return 0 number of jobs as argument values did not matched
313+
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
314+
assert len(jobs) == 0
315+
mock_client.spider.jobs.list.assert_called_once()
316+
317+
mock_client = Mock()
318+
with patch(
319+
"spidermon.contrib.scrapy.monitors.monitors.Client"
320+
) as mock_client_class:
321+
mock_client_class.return_value = mock_client
322+
monitor = TestZyteJobsComparisonMonitor()
323+
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
324+
monitor.data = Mock()
325+
326+
def mock_getlist(key, default=None):
327+
data = {
328+
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS: ["finished"],
329+
}
330+
return data.get(key, default)
331+
332+
monitor.crawler.settings = Mock()
333+
monitor.crawler.settings.getlist.side_effect = mock_getlist
334+
monitor.crawler.settings.getdict.return_value = {}
335+
monitor.crawler.settings.getbool.return_value = True
336+
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)
337+
338+
# Return 0 number of jobs
339+
jobs = monitor._get_jobs(states=None, number_of_jobs=5)
340+
assert len(jobs) == 0
341+
mock_client.spider.jobs.list.assert_called_once()
342+
343+
mock_client = Mock()
344+
with patch(
345+
"spidermon.contrib.scrapy.monitors.monitors.Client"
346+
) as mock_client_class:
347+
mock_client_class.return_value = mock_client
348+
monitor = TestZyteJobsComparisonMonitor()
349+
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
350+
monitor.data = Mock()
351+
352+
monitor.crawler.settings = Mock()
353+
monitor.crawler.settings.getlist.return_value = ["finished"]
354+
monitor.crawler.settings.getdict.return_value = {"is_debug": False}
355+
monitor.crawler.settings.getbool.return_value = True
356+
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)
357+
358+
# Return 0 number of jobs
359+
jobs = monitor._get_jobs(states=None, number_of_jobs=5)
360+
assert len(jobs) == 0
361+
mock_client.spider.jobs.list.assert_called_once()
362+
236363

237364
@pytest.mark.parametrize(
238365
["item_count", "previous_counts", "threshold", "should_raise"],

0 commit comments

Comments
 (0)