Skip to content

Commit 99a46c9

Browse files
authored
Make table name pattern more robust (#320)
* Make table name pattern more robust * Fix integration tests * Test clean_table_names method
1 parent 2d65461 commit 99a46c9

File tree

3 files changed

+32
-16
lines changed

3 files changed

+32
-16
lines changed

src/ferc_xbrl_extractor/datapackage.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def __hash__(self):
139139
Map callables to schema field type to convert parsed values (Data Package `field.type`).
140140
"""
141141

142-
TABLE_NAME_PATTERN = re.compile("(.+) - Schedule - (.*)") # noqa: W605, FS003
142+
# Raw string so ``\s`` reaches the regex engine verbatim rather than being an
# invalid str escape; matching is case-insensitive and whitespace-tolerant.
TABLE_NAME_PATTERN = re.compile(r"(.+)\s+-\s+Schedule\s+-\s+(.*)", re.I)
143143
"""
144144
Simple regex pattern used to clean up table names.
145145
"""
@@ -226,7 +226,10 @@ def clean_table_names(name: str) -> str | None:
226226
name = _lowercase_words(name)
227227
m = TABLE_NAME_PATTERN.match(name)
228228
if not m:
229-
return None
229+
# Some taxonomies have a table that lists deprecated items. Ignore these tables.
230+
if "Deprecated" in name:
231+
return None
232+
raise RuntimeError(f"Error could not parse table name: '{name}'.")
230233

231234
# Rearrange name to be {table_name}_{page_number}
232235
table_name = f"{m.group(2)}_{m.group(1)}"

tests/integration/data_quality_test.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,19 +44,9 @@ def test_lost_facts_pct(extracted, request):
4444
)
4545
used_fact_ratio = total_used_facts / total_facts
4646

47-
if "form6_" in request.node.name:
48-
# We have unallocated data for Form 6 for some reason.
49-
total_threshold = 0.9
50-
per_filing_threshold = 0.8
51-
# Assert that this is < 0.97 so we remember to fix this test once we
52-
# fix the bug. We don't use xfail here because the parametrization is
53-
# at the *fixture* level, and only the lost facts tests should fail
54-
# for form 6.
55-
assert used_fact_ratio > total_threshold and used_fact_ratio <= 0.97
56-
else:
57-
total_threshold = 0.99
58-
per_filing_threshold = 0.95
59-
assert used_fact_ratio > total_threshold and used_fact_ratio <= 1
47+
total_threshold = 0.99
48+
per_filing_threshold = 0.95
49+
assert used_fact_ratio > total_threshold and used_fact_ratio <= 1
6050

6151
for instance_stats in stats.values():
6252
instance_used_ratio = (

tests/unit/datapackage_test.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pandas as pd
66
import pytest
77

8-
from ferc_xbrl_extractor.datapackage import Resource, fuzzy_dedup
8+
from ferc_xbrl_extractor.datapackage import Resource, clean_table_names, fuzzy_dedup
99
from ferc_xbrl_extractor.taxonomy import LinkRole
1010

1111
logger = logging.getLogger(__name__)
@@ -190,3 +190,26 @@ def test_fuzzy_dedup_failed_to_resolve():
190190
ValueError, match=r"Fact a:job has values.*'accountant'.*'pringle'.*"
191191
):
192192
fuzzy_dedup(df)
193+
194+
195+
@pytest.mark.parametrize(
196+
"input_name,cleaned_name,is_bad",
197+
[
198+
("001 - Schedule - Test Table Name", "test_table_name_001", False),
199+
("002 - schedule - Lowercase Table Name", "lowercase_table_name_002", False),
200+
(
201+
"003 - Schedule - Weird Space Table Name",
202+
"weird_space_table_name_003",
203+
False,
204+
),
205+
("004 - Deprecated - Deprecated Table Name", None, False),
206+
("005 - Bad - Bad Table Name", None, True),
207+
],
208+
)
209+
def test_clean_table_names(input_name: str, cleaned_name: str | None, is_bad: bool):
210+
"""Test clean_table_names method."""
211+
if is_bad:
212+
with pytest.raises(RuntimeError):
213+
clean_table_names(input_name)
214+
else:
215+
assert clean_table_names(input_name) == cleaned_name

0 commit comments

Comments
 (0)