Skip to content

Commit 339e063

Browse files
committed
fix: rename _EN_DASH to _MINUS_SIGN, add missing tests for unicode minus handling
- Rename _EN_DASH constant to _MINUS_SIGN (the character is U+2212 MINUS SIGN, not an en dash)
- Update all references in clean_text() and parse_dice() functions
- Fix docstrings to accurately describe the Unicode minus sign (U+2212)
- Add test_replaces_unicode_minus_sign to TestCleanText
- Add test_with_unicode_minus_bonus to TestParseDice for PDF modifiers
- Add test_with_uppercase_d to verify that dice notation accepts uppercase 'D' as well as lowercase 'd'
- Add TestExtractFullText class with mocked tests for table sentinel injection
- Fix slugify() regex to explicitly exclude underscores: [^a-zA-Z0-9\s-]
- Enable uppercase 'D' in dice notation regex: [dD]
1 parent 341edef commit 339e063

2 files changed

Lines changed: 44 additions & 9 deletions

File tree

api_v2/tests/test_srd_parsers.py

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import sys
33
import os
44
import pytest
5+
from unittest.mock import patch, MagicMock
56

67
# Make the parsers package importable without installing it
78
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..'))
@@ -12,6 +13,7 @@
1213
parse_cost,
1314
parse_dice,
1415
extract_section,
16+
extract_full_text,
1517
)
1618

1719

@@ -31,6 +33,10 @@ def test_normalizes_whitespace(self):
3133
def test_passthrough_normal_text(self):
3234
assert clean_text("Acid Arrow") == "Acid Arrow"
3335

36+
def test_replaces_unicode_minus_sign(self):
37+
# U+2212 MINUS SIGN appears in PDF ability score modifiers like "−8"
38+
assert clean_text("−8") == "-8"
39+
3440

3541
class TestSlugify:
3642
def test_lowercases(self):
@@ -88,6 +94,14 @@ def test_no_match_returns_none(self):
8894
assert parse_dice("") is None
8995
assert parse_dice("some text") is None
9096

97+
def test_with_unicode_minus_bonus(self):
98+
# PDF ability modifiers use U+2212 MINUS SIGN, not ASCII hyphen-minus
99+
assert parse_dice("1d8−1") == {"count": 1, "die": 8, "bonus": -1}
100+
101+
def test_with_uppercase_d(self):
102+
assert parse_dice("2D6") == {"count": 2, "die": 6, "bonus": 0}
103+
assert parse_dice("1D8+2") == {"count": 1, "die": 8, "bonus": 2}
104+
91105

92106
class TestExtractSection:
93107
def test_extracts_between_markers(self):
@@ -107,3 +121,24 @@ def test_raises_if_end_not_found(self):
107121
with pytest.raises(ValueError, match="end marker"):
108122
import re
109123
extract_section("START here", re.compile(r"START"), re.compile(r"MISSING"))
124+
125+
126+
class TestExtractFullText:
127+
def test_injects_table_row_sentinels(self):
128+
fake_page = MagicMock()
129+
fake_page.extract_text.return_value = "page text"
130+
fake_page.extract_tables.return_value = [[["27", "14", "25"]]]
131+
with patch("pdfplumber.open") as mock_open:
132+
mock_open.return_value.__enter__.return_value.pages = [fake_page]
133+
result = extract_full_text("dummy.pdf")
134+
assert "§TABLE_ROW§27|14|25§" in result
135+
assert "page text" in result
136+
137+
def test_skips_empty_table_rows(self):
138+
fake_page = MagicMock()
139+
fake_page.extract_text.return_value = "text"
140+
fake_page.extract_tables.return_value = [[[None, None, None]]]
141+
with patch("pdfplumber.open") as mock_open:
142+
mock_open.return_value.__enter__.return_value.pages = [fake_page]
143+
result = extract_full_text("dummy.pdf")
144+
assert "§TABLE_ROW§" not in result

data/raw_sources/srd_5_2/parsers/base.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -12,16 +12,16 @@
1212
"ffl": "ffl",
1313
}
1414

15-
# Unicode en-dash used as minus sign in PDF ability score modifiers
16-
_EN_DASH = "−"
15+
# Unicode minus sign (U+2212) used in PDF ability score modifiers
16+
_MINUS_SIGN = "−"
1717

1818

1919
def clean_text(s: str) -> str:
20-
"""Strip ligatures, soft hyphens, en-dash → ASCII minus, normalize whitespace."""
20+
"""Strip ligatures, soft hyphens, Unicode minus sign (U+2212) → ASCII minus, normalize whitespace."""
2121
for ligature, replacement in LIGATURE_MAP.items():
2222
s = s.replace(ligature, replacement)
2323
s = s.replace("\xad", "") # soft hyphen
24-
s = s.replace(_EN_DASH, "-") # en-dash → ASCII minus
24+
s = s.replace(_MINUS_SIGN, "-") # Unicode minus sign (U+2212) → ASCII minus
2525
s = unicodedata.normalize("NFC", s)
2626
s = re.sub(r"\s+", " ", s).strip()
2727
return s
@@ -30,7 +30,7 @@ def clean_text(s: str) -> str:
3030
def slugify(name: str) -> str:
3131
"""Normalize a name to a URL-safe slug for dict-key matching."""
3232
s = clean_text(name).lower()
33-
s = re.sub(r"[^\w\s-]", "", s)
33+
s = re.sub(r"[^a-zA-Z0-9\s-]", "", s)
3434
s = re.sub(r"\s+", "-", s.strip())
3535
s = re.sub(r"-+", "-", s)
3636
return s
@@ -45,10 +45,10 @@ def parse_cost(s: str) -> dict | None:
4545

4646

4747
def parse_dice(s: str) -> dict | None:
48-
"""Parse '2d6+3' or '1d8-1' (also with en-dash) into {"count": int, "die": int, "bonus": int}."""
49-
# Normalize en-dash to ASCII minus before matching
50-
s = s.replace(_EN_DASH, "-")
51-
m = re.search(r"(\d+)d(\d+)([+-]\d+)?", s)
48+
"""Parse '2d6+3' or '1d8-1' (also with Unicode minus sign U+2212) into {"count": int, "die": int, "bonus": int}."""
49+
# Normalize Unicode minus sign (U+2212) to ASCII minus before matching
50+
s = s.replace(_MINUS_SIGN, "-")
51+
m = re.search(r"(\d+)[dD](\d+)([+-]\d+)?", s)
5252
if not m:
5353
return None
5454
return {

0 commit comments

Comments
 (0)