fix: rename _EN_DASH to _MINUS_SIGN, add missing tests for unicode minus handling

augustjohnson · augustjohnson · commit 339e0637a6b4 · 2026-05-22T17:57:29.000-05:00
- Rename _EN_DASH constant to _MINUS_SIGN (the character it holds is U+2212 MINUS SIGN, not U+2013 EN DASH)
- Update all references in clean_text() and parse_dice() functions
- Fix docstrings to accurately describe Unicode minus sign (U+2212)
- Add test_replaces_unicode_minus_sign to TestCleanText
- Add test_with_unicode_minus_bonus to TestParseDice for PDF modifiers
- Add test_with_uppercase_d to verify that parse_dice() accepts an uppercase 'D' (e.g. "2D6") as well as lowercase 'd'
- Add TestExtractFullText class with mocked tests for table sentinel injection
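- Fix slugify() regex to explicitly exclude underscores: [^a-zA-Z0-9\s-] (note: unlike the previous \w-based pattern, this also strips non-ASCII letters and digits, e.g. accented characters, from slugs)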
- Enable uppercase 'D' in dice notation regex: [dD]
diff --git a/api_v2/tests/test_srd_parsers.py b/api_v2/tests/test_srd_parsers.py
@@ -2,6 +2,7 @@
 import sys
 import os
 import pytest
+from unittest.mock import patch, MagicMock
 
 # Make the parsers package importable without installing it
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..'))
@@ -12,6 +13,7 @@
     parse_cost,
     parse_dice,
     extract_section,
+    extract_full_text,
 )
 
 
@@ -31,6 +33,10 @@ def test_normalizes_whitespace(self):
     def test_passthrough_normal_text(self):
         assert clean_text("Acid Arrow") == "Acid Arrow"
 
+    def test_replaces_unicode_minus_sign(self):
+        # U+2212 MINUS SIGN appears in PDF ability score modifiers like "−8"
+        assert clean_text("−8") == "-8"
+
 
 class TestSlugify:
     def test_lowercases(self):
@@ -88,6 +94,14 @@ def test_no_match_returns_none(self):
         assert parse_dice("") is None
         assert parse_dice("some text") is None
 
+    def test_with_unicode_minus_bonus(self):
+        # PDF ability modifiers use U+2212 MINUS SIGN, not ASCII hyphen-minus
+        assert parse_dice("1d8−1") == {"count": 1, "die": 8, "bonus": -1}
+
+    def test_with_uppercase_d(self):
+        assert parse_dice("2D6") == {"count": 2, "die": 6, "bonus": 0}
+        assert parse_dice("1D8+2") == {"count": 1, "die": 8, "bonus": 2}
+
 
 class TestExtractSection:
     def test_extracts_between_markers(self):
@@ -107,3 +121,24 @@ def test_raises_if_end_not_found(self):
         with pytest.raises(ValueError, match="end marker"):
             import re
             extract_section("START here", re.compile(r"START"), re.compile(r"MISSING"))
+
+
+class TestExtractFullText:
+    def test_injects_table_row_sentinels(self):
+        fake_page = MagicMock()
+        fake_page.extract_text.return_value = "page text"
+        fake_page.extract_tables.return_value = [[["27", "14", "25"]]]
+        with patch("pdfplumber.open") as mock_open:
+            mock_open.return_value.__enter__.return_value.pages = [fake_page]
+            result = extract_full_text("dummy.pdf")
+        assert "§TABLE_ROW§27|14|25§" in result
+        assert "page text" in result
+
+    def test_skips_empty_table_rows(self):
+        fake_page = MagicMock()
+        fake_page.extract_text.return_value = "text"
+        fake_page.extract_tables.return_value = [[[None, None, None]]]
+        with patch("pdfplumber.open") as mock_open:
+            mock_open.return_value.__enter__.return_value.pages = [fake_page]
+            result = extract_full_text("dummy.pdf")
+        assert "§TABLE_ROW§" not in result
diff --git a/data/raw_sources/srd_5_2/parsers/base.py b/data/raw_sources/srd_5_2/parsers/base.py
@@ -12,16 +12,16 @@
     "ﬄ": "ffl",
 }
 
-# Unicode en-dash used as minus sign in PDF ability score modifiers
-_EN_DASH = "−"
+# Unicode minus sign (U+2212) used in PDF ability score modifiers
+_MINUS_SIGN = "−"
 
 
 def clean_text(s: str) -> str:
-    """Strip ligatures, soft hyphens, en-dash → ASCII minus, normalize whitespace."""
+    """Strip ligatures, soft hyphens, Unicode minus sign (U+2212) → ASCII minus, normalize whitespace."""
     for ligature, replacement in LIGATURE_MAP.items():
         s = s.replace(ligature, replacement)
     s = s.replace("\xad", "")  # soft hyphen
-    s = s.replace(_EN_DASH, "-")  # en-dash → ASCII minus
+    s = s.replace(_MINUS_SIGN, "-")  # Unicode minus sign (U+2212) → ASCII minus
     s = unicodedata.normalize("NFC", s)
     s = re.sub(r"\s+", " ", s).strip()
     return s
@@ -30,7 +30,7 @@ def clean_text(s: str) -> str:
 def slugify(name: str) -> str:
     """Normalize a name to a URL-safe slug for dict-key matching."""
     s = clean_text(name).lower()
-    s = re.sub(r"[^\w\s-]", "", s)
+    s = re.sub(r"[^a-zA-Z0-9\s-]", "", s)
     s = re.sub(r"\s+", "-", s.strip())
     s = re.sub(r"-+", "-", s)
     return s
@@ -45,10 +45,10 @@ def parse_cost(s: str) -> dict | None:
 
 
 def parse_dice(s: str) -> dict | None:
-    """Parse '2d6+3' or '1d8-1' (also with en-dash) into {"count": int, "die": int, "bonus": int}."""
-    # Normalize en-dash to ASCII minus before matching
-    s = s.replace(_EN_DASH, "-")
-    m = re.search(r"(\d+)d(\d+)([+-]\d+)?", s)
+    """Parse '2d6+3' or '1d8-1' (also with Unicode minus sign U+2212) into {"count": int, "die": int, "bonus": int}."""
+    # Normalize Unicode minus sign (U+2212) to ASCII minus before matching
+    s = s.replace(_MINUS_SIGN, "-")
+    m = re.search(r"(\d+)[dD](\d+)([+-]\d+)?", s)
     if not m:
         return None
     return {