Skip to content

Commit 331af75

Browse files
fix: change default list of values to be skipped and default behavior of skip falsy to True
Signed-off-by: Nilton Junior <ngm.junior@outlook.com>
1 parent a6116a5 commit 331af75

File tree

3 files changed

+74
-9
lines changed

3 files changed

+74
-9
lines changed

docs/source/settings.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ If this setting is not provided or set to ``False``, spider statistics will be:
185185
186186
SPIDERMON_FIELD_COVERAGE_SKIP_FALSY
187187
-----------------------------------
188-
Default: ``False``
188+
Default: ``True``
189189

190190
When enabled, returned fields that have falsy values (empty strings, empty lists, empty tuples, empty dictionaries, zero, False, etc.) will not be counted as fields with a value.
191191

@@ -238,9 +238,9 @@ If this setting is not provided or set to ``False``, spider statistics will be:
238238
239239
SPIDERMON_FIELD_COVERAGE_SKIP_VALUES
240240
------------------------------------
241-
Default: ``[]``
241+
Default: ``["", [], {}, "N/A", "-"]``
242242

243-
A list of custom values that should not be counted as valid field values when calculating field coverage. This is useful when your items contain placeholder values like "N/A", "-", "TBD", etc. that indicate missing data but are not Python falsy values. You can also skip numeric values like ``0`` or ``-1`` if they represent missing data in your use case.
243+
A list of custom values that should not be counted as valid field values when calculating field coverage. By default, this list contains the empty string, the empty list, the empty dict, ``"N/A"``, and ``"-"``. Override it to customize which values are skipped; this is useful when your items contain other placeholder values (for example ``"TBD"``) that indicate missing data but are not Python falsy values. You can also skip numeric values like ``0`` or ``-1`` if they represent missing data in your use case.
244244

245245
This setting works in addition to ``SPIDERMON_FIELD_COVERAGE_SKIP_NONE`` and ``SPIDERMON_FIELD_COVERAGE_SKIP_FALSY``. Values are matched using exact equality (``==``), so type matters (e.g., the string ``"0"`` is different from the integer ``0``).
246246

@@ -285,7 +285,7 @@ If this setting is set to ``["N/A", "-", "TBD"]``, spider statistics will be:
285285
'spidermon_field_coverage/dict/field_3': 0.5, # Ignored "-"
286286
'spidermon_field_coverage/dict/field_4': 0.5, # Ignored "TBD"
287287
288-
If this setting is not provided or set to an empty list, spider statistics will be:
288+
To override the default skip values, set this to a custom list. If set to an empty list ``[]``, no values are skipped by this setting, and the ``SPIDERMON_FIELD_COVERAGE_SKIP_FALSY`` check is disabled as well. If the setting is not provided, the default values are used. Without the default skip values, spider statistics would be:
289289

290290
.. code-block:: python
291291

spidermon/contrib/scrapy/extensions.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,31 @@ def __init__(
6060
self.periodic_tasks = {}
6161
self.client = Client(self.crawler.settings)
6262

63+
@staticmethod
64+
def _get_default_skip_values():
65+
"""Get the default skip values array.
66+
67+
Default skip values: empty string, empty list, empty dict, 'N/A', '-'
68+
"""
69+
return ["", [], {}, "N/A", "-"]
70+
6371
def _get_skip_values_list(self, settings):
6472
"""Get skip values list, supporting Python lists, JSON strings, and
6573
comma-separated strings.
6674
6775
This allows preserving types (e.g., integers) when provided as Python
6876
lists or JSON strings, while still supporting comma-separated strings
6977
for backward compatibility.
78+
79+
Default skip values: empty string, empty list, empty dict, 'N/A', '-'
7080
"""
71-
value = settings.get("SPIDERMON_FIELD_COVERAGE_SKIP_VALUES", [])
81+
# Default skip values
82+
default_skip_values = self._get_default_skip_values()
83+
84+
value = settings.get("SPIDERMON_FIELD_COVERAGE_SKIP_VALUES", None)
85+
if value is None:
86+
return default_skip_values
87+
7288
if not value:
7389
return []
7490

@@ -194,7 +210,7 @@ def _count_item(
194210
if skip_none_values and value is None:
195211
continue
196212

197-
if skip_falsy_values and not value:
213+
if skip_falsy_values and value in self._get_default_skip_values():
198214
continue
199215

200216
if value in skip_values:
@@ -264,9 +280,24 @@ def item_scraped(self, item, response, spider):
264280
"SPIDERMON_FIELD_COVERAGE_SKIP_NONE", False
265281
)
266282
skip_falsy_values = spider.crawler.settings.getbool(
267-
"SPIDERMON_FIELD_COVERAGE_SKIP_FALSY", False
283+
"SPIDERMON_FIELD_COVERAGE_SKIP_FALSY", True
284+
)
285+
# Check if SPIDERMON_FIELD_COVERAGE_SKIP_VALUES was explicitly set
286+
# to empty list. If so, disable default skip values check.
287+
skip_values_setting = spider.crawler.settings.get(
288+
"SPIDERMON_FIELD_COVERAGE_SKIP_VALUES"
268289
)
269290
skip_values = self._get_skip_values_list(spider.crawler.settings)
291+
if (
292+
skip_values_setting is not None
293+
and isinstance(skip_values_setting, list)
294+
and skip_values_setting == []
295+
and skip_values == []
296+
):
297+
# Setting was explicitly set to empty list, disable default
298+
# skip values (skip_falsy_values should not apply)
299+
skip_falsy_values = False
300+
270301
list_field_coverage_levels = spider.crawler.settings.getint(
271302
"SPIDERMON_LIST_FIELDS_COVERAGE_LEVELS", 0
272303
)

tests/test_item_scraped_signal.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,8 @@ def test_item_scraped_count_ignore_custom_skip_values():
552552
) # Only the valid values (skipped "TBD")
553553

554554

555-
def test_item_scraped_count_do_not_ignore_custom_skip_values_by_default():
555+
def test_item_scraped_count_ignore_default_skip_values():
556+
"""Test that default skip values (empty string, empty list, empty dict, N/A, -) are applied"""
556557
settings = {
557558
"SPIDERMON_ENABLED": True,
558559
"EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 100},
@@ -562,6 +563,39 @@ def test_item_scraped_count_do_not_ignore_custom_skip_values_by_default():
562563
crawler = get_crawler(settings_dict=settings)
563564
spider = Spider.from_crawler(crawler, "example.com")
564565

566+
returned_items = [
567+
{"field1": "value1", "field2": "N/A", "field3": "-", "field4": ""},
568+
{"field1": "value1", "field2": "value2", "field3": "value3", "field4": "value4"},
569+
]
570+
571+
for item in returned_items:
572+
spider.crawler.signals.send_catch_log_deferred(
573+
signal=signals.item_scraped,
574+
item=item,
575+
response="",
576+
spider=spider,
577+
)
578+
579+
stats = spider.crawler.stats.get_stats()
580+
581+
assert stats.get("spidermon_item_scraped_count/dict/field1") == 2
582+
assert stats.get("spidermon_item_scraped_count/dict/field2") == 1 # "N/A" skipped by default
583+
assert stats.get("spidermon_item_scraped_count/dict/field3") == 1 # "-" skipped by default
584+
assert stats.get("spidermon_item_scraped_count/dict/field4") == 1 # "" skipped by default
585+
586+
587+
def test_item_scraped_count_do_not_ignore_custom_skip_values_when_empty_list():
588+
"""Test that setting skip_values to empty list disables default skip values"""
589+
settings = {
590+
"SPIDERMON_ENABLED": True,
591+
"EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 100},
592+
"SPIDERMON_ADD_FIELD_COVERAGE": True,
593+
"SPIDERMON_FIELD_COVERAGE_SKIP_VALUES": [],
594+
}
595+
596+
crawler = get_crawler(settings_dict=settings)
597+
spider = Spider.from_crawler(crawler, "example.com")
598+
565599
returned_items = [
566600
{"field1": "value1", "field2": "N/A"},
567601
{"field1": "value1", "field2": "-"},
@@ -580,7 +614,7 @@ def test_item_scraped_count_do_not_ignore_custom_skip_values_by_default():
580614
assert stats.get("spidermon_item_scraped_count/dict/field1") == 2
581615
assert (
582616
stats.get("spidermon_item_scraped_count/dict/field2") == 2
583-
) # Counted because skip_values not set
617+
) # Counted because skip_values is empty list
584618

585619

586620
def test_item_scraped_count_skip_values_type_sensitive():

0 commit comments

Comments
 (0)