Skip to content

Commit 154dfd2

Browse files
authored
Merge pull request #4787 from acl-org/python-normalize-and-latex-import
Implement conversion from LaTeX to our Markup XML, reimplementing `bin/latex_to_unicode.py`.
2 parents dbe9a46 + ecaa270 commit 154dfd2

File tree

12 files changed

+590
-18
lines changed

12 files changed

+590
-18
lines changed

python/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Changelog
22

3+
## Unreleased
4+
5+
### Added
6+
7+
- MarkupText can now be instantiated from strings (potentially) containing LaTeX markup.
8+
- This reimplements the ingestion-time functionality previously found in `bin/latex_to_unicode.py`.
9+
310
## [0.5.2] — 2025-05-16
411

512
This release adds support for Python 3.13 and initial functionality for creating new proceedings.

python/acl_anthology/exceptions.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023-2024 Marcel Bollmann <[email protected]>
1+
# Copyright 2023-2025 Marcel Bollmann <[email protected]>
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -24,6 +24,8 @@
2424
if sys.version_info >= (3, 11):
2525

2626
class AnthologyException(Exception):
27+
"""Base class from which all other exceptions defined here inherit."""
28+
2729
def __init__(self, msg: str):
2830
super().__init__(msg)
2931

python/acl_anthology/text/markuptext.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2019-2024 Marcel Bollmann <[email protected]>
1+
# Copyright 2019-2025 Marcel Bollmann <[email protected]>
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -24,8 +24,10 @@
2424
from xml.sax.saxutils import escape as xml_escape
2525

2626
from ..utils import (
27+
clean_unicode,
2728
latex_encode,
2829
latex_convert_quotes,
30+
parse_latex_to_xml,
2931
remove_extra_whitespace,
3032
stringify_children,
3133
)
@@ -171,16 +173,51 @@ def as_xml(self) -> str:
171173
return self._xml
172174

173175
@classmethod
174-
def from_string(cls, text: str) -> MarkupText:
176+
def from_string(cls, text: str, clean: bool = True) -> MarkupText:
175177
"""
176178
Arguments:
177179
text: A simple text string without any markup.
180+
clean: If True, applies the Anthology's [Unicode normalization][acl_anthology.utils.text.clean_unicode].
178181
179182
Returns:
180183
Instantiated MarkupText object corresponding to the string.
181184
"""
185+
if clean:
186+
return cls(clean_unicode(text))
182187
return cls(text)
183188

189+
@classmethod
190+
def from_latex(cls, text: str, clean: bool = True) -> MarkupText:
191+
"""
192+
Arguments:
193+
text: A text string potentially containing LaTeX markup.
194+
clean: If True, applies the Anthology's [Unicode normalization][acl_anthology.utils.text.clean_unicode].
195+
196+
Returns:
197+
Instantiated MarkupText object corresponding to the string.
198+
"""
199+
if clean:
200+
text = clean_unicode(text)
201+
element = parse_latex_to_xml(text, use_heuristics=False)
202+
return cls.from_xml(element)
203+
204+
@classmethod
205+
def from_latex_maybe(cls, text: str, clean: bool = True) -> MarkupText:
206+
"""
207+
Like `from_latex()`, but can be used when it is unclear whether the string is plain text or LaTeX. This prevents percent signs from being interpreted as LaTeX comments, and applies a heuristic to decide whether a tilde is literal or a non-breaking space.
208+
209+
Arguments:
210+
text: A text string that may be either plain text or LaTeX.
211+
clean: If True, applies the Anthology's [Unicode normalization][acl_anthology.utils.text.clean_unicode].
212+
213+
Returns:
214+
Instantiated MarkupText object corresponding to the string.
215+
"""
216+
if clean:
217+
text = clean_unicode(text)
218+
element = parse_latex_to_xml(text, use_heuristics=True)
219+
return cls.from_xml(element)
220+
184221
@classmethod
185222
def from_xml(cls, element: etree._Element) -> MarkupText:
186223
"""

python/acl_anthology/utils/__init__.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023-2024 Marcel Bollmann <[email protected]>
1+
# Copyright 2023-2025 Marcel Bollmann <[email protected]>
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -15,22 +15,29 @@
1515
from .citation import citeproc_render_html
1616
from .git import clone_or_pull_from_repo
1717
from .ids import build_id, parse_id, AnthologyID
18-
from .latex import latex_encode, latex_convert_quotes, make_bibtex_entry
18+
from .latex import (
19+
latex_encode,
20+
latex_convert_quotes,
21+
make_bibtex_entry,
22+
parse_latex_to_xml,
23+
)
1924
from .logging import setup_rich_logging, get_logger
20-
from .text import remove_extra_whitespace
25+
from .text import clean_unicode, remove_extra_whitespace
2126
from .xml import stringify_children
2227

2328

2429
__all__ = [
2530
"AnthologyID",
2631
"build_id",
2732
"citeproc_render_html",
33+
"clean_unicode",
2834
"clone_or_pull_from_repo",
2935
"get_logger",
3036
"latex_encode",
3137
"latex_convert_quotes",
3238
"make_bibtex_entry",
3339
"parse_id",
40+
"parse_latex_to_xml",
3441
"remove_extra_whitespace",
3542
"setup_rich_logging",
3643
"stringify_children",

0 commit comments

Comments
 (0)