Skip to content

Fix auto quote-conversion when converting into BibTeX + add tests #5195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions python/acl_anthology/text/markuptext.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ def markup_to_latex(element: etree._Element) -> str:
text += latex_encode(nested_element.tail)

text = MARKUP_LATEX_CMDS[tag].format(text=text)
text = latex_convert_quotes(text)
return text


Expand Down Expand Up @@ -153,10 +152,10 @@ def as_latex(self) -> str:
if self._latex is not None:
return self._latex
if isinstance(self._content, str):
latex = latex_convert_quotes(latex_encode(self._content))
latex = latex_encode(self._content)
else:
latex = markup_to_latex(self._content)
self._latex = remove_extra_whitespace(latex)
self._latex = remove_extra_whitespace(latex_convert_quotes(latex))
return self._latex

def as_xml(self) -> str:
Expand Down
40 changes: 36 additions & 4 deletions python/acl_anthology/utils/latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,39 @@
}
"""A mapping of month names to BibTeX macros."""

RE_OPENING_QUOTE_DOUBLE = re.compile(r"(?<!\\)({''}|'')\b")
RE_OPENING_QUOTE_SINGLE = re.compile(r"(?<!\\)({'}|')\b")
RE_CLOSING_QUOTE_DOUBLE = re.compile(r"(?<!\\){''}")
RE_CLOSING_QUOTE_SINGLE = re.compile(r"(?<!\\){'}")
# Patterns for locating apostrophe-style quotes in LaTeX-encoded text.
# All four are compiled with re.X (verbose mode), so whitespace and
# `#` comments inside the pattern strings are ignored by the regex engine.

# Opening double quote: `''` or `{''}` at the start of the string or right
# after whitespace, and not directly followed by whitespace or `}`.
RE_OPENING_QUOTE_DOUBLE = re.compile(
r"""
(\A|(?<=\s)) # must be start of the string or come after whitespace
({''}|'') # match double apostrophe, optionally in braces
(?!}|\s) # must not come before whitespace or closing brace }
""",
re.X,
)
# Opening single quote: `'` or `{'}` under the same positional rules as the
# double variant; the extra `'` in the lookahead keeps a bare `''` from being
# treated as a single quote followed by another apostrophe.
RE_OPENING_QUOTE_SINGLE = re.compile(
r"""
(\A|(?<=\s)) # must be start of the string or come after whitespace
({'}|') # match single apostrophe, optionally in braces
(?!'|}|\s) # must not come before whitespace, closing brace, or another apostrophe
""",
re.X,
)
# Closing double quote: only the brace-wrapped form `{''}` is matched.
# NOTE(review): this pattern also matches a braced quote that is followed by
# any non-word character (e.g. the backslash in `{''}\textbf{...}`), so it
# presumably relies on the opening-quote patterns being applied first --
# confirm against the substitution order in latex_convert_quotes.
RE_CLOSING_QUOTE_DOUBLE = re.compile(
r"""
(?<!\\) # must not come after backslash
{''} # match double apostrophe in braces
(?=\W|\Z) # must be end of the string or come before a non-word character
""",
re.X,
)
# Closing single quote: brace-wrapped `{'}` only. Because a following
# non-word character (or end of string) is required, a braced apostrophe
# inside a word such as `BERT{'}s` is not matched.
RE_CLOSING_QUOTE_SINGLE = re.compile(
r"""
(?<!\\) # must not come after backslash
{'} # match single apostrophe in braces
(?=\W|\Z) # must be end of the string or come before a non-word character
""",
re.X,
)

RE_HYPHENS_BETWEEN_NUMBERS = re.compile(r"(?<=[0-9])(-|–|—)(?=[0-9])")


Expand Down Expand Up @@ -138,6 +167,9 @@ def latex_convert_quotes(text: str) -> str:
Returns:
The input string with LaTeX quotes converted into proper opening and closing quotes, removing braces around them, if necessary.

Note:
This is called during the conversion from our XML markup to LaTeX. Straight quotation marks (`"`) will have been converted to double apostrophes, usually in braces (`{''}`), by pylatexenc; this function applies regexes to turn them into appropriate opening/closing quotes with the braces removed.

Examples:
>>> latex_convert_quotes("This {''}great{''} example")
"This ``great'' example"
Expand Down
40 changes: 40 additions & 0 deletions python/tests/text/markuptext_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,47 @@
"latex": "\\textit{D\\textbf{e\\textit{e\\textbf{e\\textit{e\\textbf{p}}}}}}ly",
},
),
( # Apostrophe character gets turned into a regular, protected apostrophe
"BERT’s and <fixed-case>BERT</fixed-case>’s Attention",
{
"text": "BERT’s and BERT’s Attention",
"html": 'BERT’s and <span class="acl-fixed-case">BERT</span>’s Attention',
"latex": "BERT{'}s and {BERT}{'}s Attention",
},
),
( # Regular quotes get turned into LaTeX quotes (and left untouched otherwise)
'This "very normal" assumption',
{
"text": 'This "very normal" assumption',
"html": 'This "very normal" assumption',
"latex": "This ``very normal'' assumption",
},
),
(
'This "very <b>bold</b>" assumption',
{
"text": 'This "very bold" assumption',
"html": 'This "very <b>bold</b>" assumption',
"latex": "This ``very \\textbf{bold}'' assumption",
},
),
( # Typographic quotes get turned into their respective LaTeX commands
"This “very normal” assumption",
{
"text": "This “very normal” assumption",
"html": "This “very normal” assumption",
"latex": "This {\\textquotedblleft}very normal{\\textquotedblright} assumption",
},
),
(
"This “very <b>bold</b>” assumption",
{
"text": "This “very bold” assumption",
"html": "This “very <b>bold</b>” assumption",
"latex": "This {\\textquotedblleft}very \\textbf{bold}{\\textquotedblright} assumption",
},
),
( # Special characters should always be in braces for BibTeX export
"Äöøéÿőßû–",
{
"text": "Äöøéÿőßû–",
Expand Down
13 changes: 11 additions & 2 deletions python/tests/utils/latex_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,24 @@
from acl_anthology.text import MarkupText
from acl_anthology.utils import latex

test_cases_latex = (
# Tests helper function used during conversion of our XML markup to LaTeX.
# Straight quotation marks (") will have been converted to double apostrophes,
# usually in braces ({''}), by pylatexenc; the function tested here applies
# heuristics to turn them into appropriate opening/closing quotes with the
# braces removed.
test_cases_latex_convert_quotes = (
("{''}This is a quotation.{''}", "``This is a quotation.''"),
("''This is a quotation.''", "``This is a quotation.''"),
("This is a {''}quotation{''}.", "This is a ``quotation''."),
("Can you 'please' {'}convert{'} this?", "Can you `please' `convert' this?"),
("My name is ''陳大文''.", "My name is ``陳大文''."),
("This isn't a quotation.", "This isn't a quotation."),
("But ''\\textbf{this}'' is", "But ``\\textbf{this}'' is"),
("But {''}\\textbf{this}{''} is", "But ``\\textbf{this}'' is"),
)


@pytest.mark.parametrize("inp, out", test_cases_latex)
@pytest.mark.parametrize("inp, out", test_cases_latex_convert_quotes)
def test_latex_convert_quotes(inp, out):
assert latex.latex_convert_quotes(inp) == out

Expand Down