Skip to content

Commit 0c653a8

Browse files
authored
Fix auto quote-conversion when converting into BibTeX + add tests (#5195)
1 parent c65663a commit 0c653a8

File tree

4 files changed

+89
-9
lines changed

4 files changed

+89
-9
lines changed

python/acl_anthology/text/markuptext.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ def markup_to_latex(element: etree._Element) -> str:
5757
text += latex_encode(nested_element.tail)
5858

5959
text = MARKUP_LATEX_CMDS[tag].format(text=text)
60-
text = latex_convert_quotes(text)
6160
return text
6261

6362

@@ -153,10 +152,10 @@ def as_latex(self) -> str:
153152
if self._latex is not None:
154153
return self._latex
155154
if isinstance(self._content, str):
156-
latex = latex_convert_quotes(latex_encode(self._content))
155+
latex = latex_encode(self._content)
157156
else:
158157
latex = markup_to_latex(self._content)
159-
self._latex = remove_extra_whitespace(latex)
158+
self._latex = remove_extra_whitespace(latex_convert_quotes(latex))
160159
return self._latex
161160

162161
def as_xml(self) -> str:

python/acl_anthology/utils/latex.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,39 @@
7272
}
7373
"""A mapping of month names to BibTeX macros."""
7474

75-
RE_OPENING_QUOTE_DOUBLE = re.compile(r"(?<!\\)({''}|'')\b")
76-
RE_OPENING_QUOTE_SINGLE = re.compile(r"(?<!\\)({'}|')\b")
77-
RE_CLOSING_QUOTE_DOUBLE = re.compile(r"(?<!\\){''}")
78-
RE_CLOSING_QUOTE_SINGLE = re.compile(r"(?<!\\){'}")
75+
RE_OPENING_QUOTE_DOUBLE = re.compile(
76+
r"""
77+
(\A|(?<=\s)) # must be start of the string or come after whitespace
78+
({''}|'') # match double apostrophe, optionally in braces
79+
(?!}|\s) # must not come before whitespace or closing brace }
80+
""",
81+
re.X,
82+
)
83+
RE_OPENING_QUOTE_SINGLE = re.compile(
84+
r"""
85+
(\A|(?<=\s)) # must be start of the string or come after whitespace
86+
({'}|') # match single apostrophe, optionally in braces
87+
(?!'|}|\s) # must not come before whitespace, closing brace, or another apostrophe
88+
""",
89+
re.X,
90+
)
91+
RE_CLOSING_QUOTE_DOUBLE = re.compile(
92+
r"""
93+
(?<!\\) # must not come after backslash
94+
{''} # match double apostrophe in braces
95+
(?=\W|\Z) # must be end of the string or come before a non-word character
96+
""",
97+
re.X,
98+
)
99+
RE_CLOSING_QUOTE_SINGLE = re.compile(
100+
r"""
101+
(?<!\\) # must not come after backslash
102+
{'} # match single apostrophe in braces
103+
(?=\W|\Z) # must be end of the string or come before a non-word character
104+
""",
105+
re.X,
106+
)
107+
79108
RE_HYPHENS_BETWEEN_NUMBERS = re.compile(r"(?<=[0-9])(-|–|—)(?=[0-9])")
80109

81110

@@ -138,6 +167,9 @@ def latex_convert_quotes(text: str) -> str:
138167
Returns:
139168
The input string with LaTeX quotes converted into proper opening and closing quotes, removing braces around them, if necessary.
140169
170+
Note:
171+
This is called during the conversion from our XML markup to LaTeX. Straight quotation marks (`"`) will have been converted to double apostrophes, usually in braces (`{''}`), by pylatexenc; this function applies regexes to turn them into appropriate opening/closing quotes with the braces removed.
172+
141173
Examples:
142174
>>> latex_convert_quotes("This {''}great{''} example")
143175
"This ``great'' example"

python/tests/text/markuptext_test.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,47 @@
105105
"latex": "\\textit{D\\textbf{e\\textit{e\\textbf{e\\textit{e\\textbf{p}}}}}}ly",
106106
},
107107
),
108+
( # Typographic apostrophe (’) becomes a regular apostrophe protected by braces in LaTeX (text and HTML keep it as-is)
109+
"BERT’s and <fixed-case>BERT</fixed-case>’s Attention",
110+
{
111+
"text": "BERT’s and BERT’s Attention",
112+
"html": 'BERT’s and <span class="acl-fixed-case">BERT</span>’s Attention',
113+
"latex": "BERT{'}s and {BERT}{'}s Attention",
114+
},
115+
),
116+
( # Straight quotes get turned into LaTeX quotes (and are left untouched in the text and HTML output)
117+
'This "very normal" assumption',
118+
{
119+
"text": 'This "very normal" assumption',
120+
"html": 'This "very normal" assumption',
121+
"latex": "This ``very normal'' assumption",
122+
},
123+
),
124+
(
125+
'This "very <b>bold</b>" assumption',
126+
{
127+
"text": 'This "very bold" assumption',
128+
"html": 'This "very <b>bold</b>" assumption',
129+
"latex": "This ``very \\textbf{bold}'' assumption",
130+
},
131+
),
132+
( # Typographic quotes get turned into their respective LaTeX commands
133+
"This “very normal” assumption",
134+
{
135+
"text": "This “very normal” assumption",
136+
"html": "This “very normal” assumption",
137+
"latex": "This {\\textquotedblleft}very normal{\\textquotedblright} assumption",
138+
},
139+
),
108140
(
141+
"This “very <b>bold</b>” assumption",
142+
{
143+
"text": "This “very bold” assumption",
144+
"html": "This “very <b>bold</b>” assumption",
145+
"latex": "This {\\textquotedblleft}very \\textbf{bold}{\\textquotedblright} assumption",
146+
},
147+
),
148+
( # Special characters should always be in braces for BibTeX export
109149
"Äöøéÿőßû–",
110150
{
111151
"text": "Äöøéÿőßû–",

python/tests/utils/latex_test.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,24 @@
1717
from acl_anthology.text import MarkupText
1818
from acl_anthology.utils import latex
1919

20-
test_cases_latex = (
20+
# Tests helper function used during conversion of our XML markup to LaTeX.
21+
# Straight quotation marks (") will have been converted to double apostrophes,
22+
# usually in braces ({''}), by pylatexenc; the function tested here applies
23+
# heuristics to turn them into appropriate opening/closing quotes with the
24+
# braces removed.
25+
test_cases_latex_convert_quotes = (
2126
("{''}This is a quotation.{''}", "``This is a quotation.''"),
27+
("''This is a quotation.''", "``This is a quotation.''"),
2228
("This is a {''}quotation{''}.", "This is a ``quotation''."),
2329
("Can you 'please' {'}convert{'} this?", "Can you `please' `convert' this?"),
2430
("My name is ''陳大文''.", "My name is ``陳大文''."),
31+
("This isn't a quotation.", "This isn't a quotation."),
32+
("But ''\\textbf{this}'' is", "But ``\\textbf{this}'' is"),
33+
("But {''}\\textbf{this}{''} is", "But ``\\textbf{this}'' is"),
2534
)
2635

2736

28-
@pytest.mark.parametrize("inp, out", test_cases_latex)
37+
@pytest.mark.parametrize("inp, out", test_cases_latex_convert_quotes)
2938
def test_latex_convert_quotes(inp, out):
3039
assert latex.latex_convert_quotes(inp) == out
3140

0 commit comments

Comments
 (0)