Make table name pattern more robust (#320)

zschira · web-flow · commit 99a46c97361d · 2025-04-28T12:51:18.000-04:00
* Make table name pattern more robust

* Fix integration tests

* Test clean_table_names method
diff --git a/src/ferc_xbrl_extractor/datapackage.py b/src/ferc_xbrl_extractor/datapackage.py
@@ -139,7 +139,7 @@ def __hash__(self):
 Map callables to schema field type to convert parsed values (Data Package `field.type`).
 """
 
-TABLE_NAME_PATTERN = re.compile("(.+) - Schedule - (.*)")  # noqa: W605, FS003
+TABLE_NAME_PATTERN = re.compile(r"(.+)\s+-\s+Schedule\s+-\s+(.*)", re.I)
 """
 Simple regex pattern used to clean up table names.
 """
@@ -226,7 +226,10 @@ def clean_table_names(name: str) -> str | None:
     name = _lowercase_words(name)
     m = TABLE_NAME_PATTERN.match(name)
     if not m:
-        return None
+        # Some taxonomies have a table that lists deprecated items. Ignore these tables
+        if "Deprecated" in name:
+            return None
+        raise RuntimeError(f"Error could not parse table name: '{name}'.")
 
     # Rearrange name to be {table_name}_{page_number}
     table_name = f"{m.group(2)}_{m.group(1)}"
diff --git a/tests/integration/data_quality_test.py b/tests/integration/data_quality_test.py
@@ -44,19 +44,9 @@ def test_lost_facts_pct(extracted, request):
     )
     used_fact_ratio = total_used_facts / total_facts
 
-    if "form6_" in request.node.name:
-        # We have unallocated data for Form 6 for some reason.
-        total_threshold = 0.9
-        per_filing_threshold = 0.8
-        # Assert that this is < 0.97 so we remember to fix this test once we
-        # fix the bug. We don't use xfail here because the parametrization is
-        # at the *fixture* level, and only the lost facts tests should fail
-        # for form 6.
-        assert used_fact_ratio > total_threshold and used_fact_ratio <= 0.97
-    else:
-        total_threshold = 0.99
-        per_filing_threshold = 0.95
-        assert used_fact_ratio > total_threshold and used_fact_ratio <= 1
+    total_threshold = 0.99
+    per_filing_threshold = 0.95
+    assert used_fact_ratio > total_threshold and used_fact_ratio <= 1
 
     for instance_stats in stats.values():
         instance_used_ratio = (
diff --git a/tests/unit/datapackage_test.py b/tests/unit/datapackage_test.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import pytest
 
-from ferc_xbrl_extractor.datapackage import Resource, fuzzy_dedup
+from ferc_xbrl_extractor.datapackage import Resource, clean_table_names, fuzzy_dedup
 from ferc_xbrl_extractor.taxonomy import LinkRole
 
 logger = logging.getLogger(__name__)
@@ -190,3 +190,26 @@ def test_fuzzy_dedup_failed_to_resolve():
         ValueError, match=r"Fact a:job has values.*'accountant'.*'pringle'.*"
     ):
         fuzzy_dedup(df)
+
+
+@pytest.mark.parametrize(
+    "input_name,cleaned_name,is_bad",
+    [
+        ("001 - Schedule - Test Table Name", "test_table_name_001", False),
+        ("002 - schedule - Lowercase Table Name", "lowercase_table_name_002", False),
+        (
+            "003 -    Schedule  - Weird Space Table Name",
+            "weird_space_table_name_003",
+            False,
+        ),
+        ("004 - Deprecated - Deprecated Table Name", None, False),
+        ("005 - Bad - Bad Table Name", None, True),
+    ],
+)
+def test_clean_table_names(input_name: str, cleaned_name: str | None, is_bad: bool):
+    """Check cleaned names, skipped deprecated tables, and unparseable-name errors."""
+    if is_bad:
+        with pytest.raises(RuntimeError):
+            clean_table_names(input_name)
+    else:
+        assert clean_table_names(input_name) == cleaned_name