From 5c1bdb3ae72efe520484b95e959d3570b68bfede Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 5 Mar 2025 15:58:36 +0100 Subject: [PATCH 1/3] Add first version of LaTeX-to-MarkupXML conversion This reimplements most of `bin/latex_to_unicode.py` within the new library. More tests are needed, and some conversions done in `latex_to_unicode` are still missing. --- python/acl_anthology/text/markuptext.py | 13 ++ python/acl_anthology/utils/__init__.py | 8 +- python/acl_anthology/utils/latex.py | 169 ++++++++++++++++++++++++ python/acl_anthology/utils/xml.py | 24 ++++ python/tests/text/markuptext_test.py | 77 ++++++++++- 5 files changed, 289 insertions(+), 2 deletions(-) diff --git a/python/acl_anthology/text/markuptext.py b/python/acl_anthology/text/markuptext.py index fa52f6fc39..1c724ec7b4 100644 --- a/python/acl_anthology/text/markuptext.py +++ b/python/acl_anthology/text/markuptext.py @@ -26,6 +26,7 @@ from ..utils import ( latex_encode, latex_convert_quotes, + parse_latex_to_xml, remove_extra_whitespace, stringify_children, ) @@ -181,6 +182,18 @@ def from_string(cls, text: str) -> MarkupText: """ return cls(text) + @classmethod + def from_latex(cls, text: str) -> MarkupText: + """ + Arguments: + text: A text string potentially containing LaTeX markup. + + Returns: + Instantiated MarkupText object corresponding to the string. 
+ """ + element = parse_latex_to_xml(text) + return cls.from_xml(element) + @classmethod def from_xml(cls, element: etree._Element) -> MarkupText: """ diff --git a/python/acl_anthology/utils/__init__.py b/python/acl_anthology/utils/__init__.py index b34dac41a5..17ac1cd6c4 100644 --- a/python/acl_anthology/utils/__init__.py +++ b/python/acl_anthology/utils/__init__.py @@ -15,7 +15,12 @@ from .citation import citeproc_render_html from .git import clone_or_pull_from_repo from .ids import build_id, parse_id, AnthologyID -from .latex import latex_encode, latex_convert_quotes, make_bibtex_entry +from .latex import ( + latex_encode, + latex_convert_quotes, + make_bibtex_entry, + parse_latex_to_xml, +) from .logging import setup_rich_logging, get_logger from .text import remove_extra_whitespace from .xml import stringify_children @@ -31,6 +36,7 @@ "latex_convert_quotes", "make_bibtex_entry", "parse_id", + "parse_latex_to_xml", "remove_extra_whitespace", "setup_rich_logging", "stringify_children", diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py index a52ef70615..a146b8ab29 100644 --- a/python/acl_anthology/utils/latex.py +++ b/python/acl_anthology/utils/latex.py @@ -18,6 +18,7 @@ import re from functools import lru_cache +from lxml import etree from typing import cast, Optional, TypeAlias, TYPE_CHECKING if TYPE_CHECKING: @@ -27,12 +28,30 @@ SerializableAsBibTeX: TypeAlias = None | str | MarkupText | list[NameSpecification] """Any type that can be supplied to `make_bibtex_entry`.""" +from .logging import get_logger +from .xml import append_text from pylatexenc.latexencode import ( UnicodeToLatexEncoder, UnicodeToLatexConversionRule, RULE_DICT, ) +from pylatexenc.latexwalker import ( + LatexWalker, + LatexNode, + LatexCharsNode, + LatexGroupNode, + LatexMacroNode, + LatexMathNode, + LatexSpecialsNode, +) +from pylatexenc.latex2text import LatexNodes2Text + +log = get_logger() + 
+################################################################################ +### UNICODE TO LATEX (BIBTEX) +################################################################################ LATEXENC = UnicodeToLatexEncoder( conversion_rules=[ @@ -54,6 +73,7 @@ unknown_char_policy="keep", unknown_char_warning=False, ) +"""A UnicodeToLatexEncoder instance intended for BibTeX generation.""" BIBTEX_FIELD_NEEDS_ENCODING = {"journal", "address", "publisher", "note"} """Any BibTeX field whose value should be LaTeX-encoded first.""" @@ -211,3 +231,152 @@ def namespecs_to_bibtex(namespecs: list[NameSpecification]) -> str: A BibTeX-formatted string representing the given names. """ return " and\n ".join(spec.name.as_bibtex() for spec in namespecs) + + +################################################################################ +### LATEX TO UNICODE/XML +################################################################################ + +LATEX_MACRO_TO_XMLTAG = { + "emph": "i", + "em": "i", + "textit": "i", + "it": "i", + "textsl": "i", + "sl": "i", + "textbf": "b", + "bf": "b", + "url": "url", +} +LATEX_TO_TEXT = LatexNodes2Text( + strict_latex_spaces=True, +) + + +def _is_trivial_math(node: LatexMathNode) -> bool: + """Helper function to determine whether or not a LatexMathNode contains only 'trivial' content that doesn't require a <tex-math> node.""" + content = node.latex_verbatim().strip("$").replace(r"\%", "%") + return all(c.isspace() or c.isdigit() or c in (".,@%~") for c in content) + + +def _should_parse_macro_as_text(node: LatexMacroNode) -> bool: + """Helper function to determine whether or not a LatexMacroNode should be parsed as a simple character macro.""" + subnodes = node.nodeargd.argnlist + if len(subnodes) == 0: + # Macro without arguments; e.g. 
\i or \l + return True + elif len(subnodes) > 1: + # Macro with more than one argument + return False + subnode = subnodes[0] + if subnode.isNodeType(LatexCharsNode) and subnode.len == 1: + return True + if ( + subnode.isNodeType(LatexGroupNode) + and len(subnode.nodelist) == 1 + and subnode.nodelist[0].isNodeType(LatexCharsNode) + and subnode.nodelist[0].len == 1 + ): + return True + return False + + +def _should_wrap_in_fixed_case(node: LatexGroupNode) -> bool: + """Helper function to determine whether or not a LatexGroupNode should produce a <fixed-case> tag.""" + if len(node.nodelist) == 0 or node.delimiters != ("{", "}"): + return False + if node.latex_verbatim().startswith("{\\"): + # {\...} does *not* protect case + return False + if node.nodelist[0].isNodeType(LatexMathNode): + # Don't mark {$...$} + return False + if node.nodelist[0].isNodeType(LatexSpecialsNode): + # Don't mark {``}, {--}, etc. + return False + return True + + +def _parse_nodelist_to_element( + nodelist: list[LatexNode], + element: etree._Element, + use_fixed_case: bool, + in_macro: bool = False, +) -> None: + """Parse a list of LaTeX nodes into an XML element using the Anthology markup format. + + Arguments: + nodelist: The list of parsed LaTeX nodes. + element: An XML element into which the parsed nodes will be added. + use_fixed_case: Flag indicating whether <fixed-case> protection should be applied. + in_macro: Flag indicating whether this function was called by recursing into a macro node. (Do not set this manually.) + + Returns: + None; the XML element is modified in-place. + """ + for node in nodelist: + if node.isNodeType(LatexCharsNode): + # Plain text + append_text(element, node.chars) + elif node.isNodeType(LatexMacroNode): + # LaTeX macro + if (tag := LATEX_MACRO_TO_XMLTAG.get(node.macroname)) is not None: + # This macro should get its own XML tag (e.g. 
\textbf -> <b>) + subelem = etree.SubElement(element, tag) + subnodes = node.nodeargd.argnlist + _parse_nodelist_to_element( + subnodes, subelem, use_fixed_case, in_macro=True + ) + elif _should_parse_macro_as_text(node): + # This macro should be parsed as text because it probably + # represents a special character, such as \v{c} or \"I + append_text(element, LATEX_TO_TEXT.macro_node_to_text(node)) + else: + # This is a macro we don't know how to handle - emit warning, + # then discard macro but recurse into its children + log.warning(f"Unhandled LaTeX macro '{node.macroname}'") + subnodes = node.nodeargd.argnlist + _parse_nodelist_to_element( + subnodes, subelem, use_fixed_case, in_macro=True + ) + elif node.isNodeType(LatexGroupNode): + # Bracketed group, such as {...} or [...] + if not in_macro and _should_wrap_in_fixed_case(node): + # Protect this with <fixed-case>, then recurse + subelem = etree.SubElement(element, "fixed-case") + _parse_nodelist_to_element(node.nodelist, subelem, False) + else: + # Just recurse + _parse_nodelist_to_element(node.nodelist, element, use_fixed_case) + elif node.isNodeType(LatexMathNode): + # Math node + if _is_trivial_math(node): + # Just append as text + append_text(element, LATEX_TO_TEXT.math_node_to_text(node)) + else: + # Keep verbatim, but wrap in <tex-math> + subelem = etree.SubElement(element, "tex-math") + subelem.text = node.latex_verbatim().strip("$") + elif node.isNodeType(LatexSpecialsNode): + # TODO: Is this always the correct way? + append_text(element, LATEX_TO_TEXT.specials_node_to_text(node)) + else: + # Comments or environments + log.warning(f"Unhandled node type: {node.nodeType}") + + +def parse_latex_to_xml(latex_input: str, use_fixed_case: bool = True) -> etree._Element: + """Convert a string with LaTeX markup into the Anthology XML format. + + Arguments: + latex_input: A string potentially including LaTeX markup. + use_fixed_case: Flag indicating whether <fixed-case> protection should be applied. 
+ + Returns: + An XML element representing the given LaTeX input in the Anthology XML format for markup strings. + """ + element = etree.Element("root") + walker = LatexWalker(latex_input) + nodelist, *_ = walker.get_latex_nodes() + _parse_nodelist_to_element(nodelist, element, use_fixed_case) + return element diff --git a/python/acl_anthology/utils/xml.py b/python/acl_anthology/utils/xml.py index dd9c7b8ca1..3f771dc013 100644 --- a/python/acl_anthology/utils/xml.py +++ b/python/acl_anthology/utils/xml.py @@ -81,6 +81,30 @@ def assert_equals(elem: etree._Element, other: etree._Element) -> None: assert_equals(elem_child, other_child) +def append_text(elem: etree._Element, text: str) -> None: + """Append text to an XML element. + + If the XML element has children, the text will be appended to the tail of the last child; otherwise, it will be appended to its text attribute. + + Arguments: + elem: The XML element. + text: The text string to append to the XML element. + + Returns: + None; the XML element is modified in-place. 
+ """ + if len(elem): + # already has children — append text to tail + if elem[-1].tail is not None: + elem[-1].tail = "".join((elem[-1].tail, text)) + else: + elem[-1].tail = text + elif elem.text is not None: + elem.text = "".join((elem.text, text)) + else: + elem.text = text + + def clean_whitespace( text: Optional[str], func: Optional[Callable[[str], str]] = None ) -> Optional[str]: diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py index 5132e75a8b..d872453067 100644 --- a/python/tests/text/markuptext_test.py +++ b/python/tests/text/markuptext_test.py @@ -157,7 +157,7 @@ @pytest.mark.parametrize("inp, out", test_cases_markup) -def test_markup(inp, out): +def test_markup_from_xml(inp, out): xml = f"{inp}" element = etree.fromstring(xml) markup = MarkupText.from_xml(element) @@ -181,3 +181,78 @@ def test_simple_string(): etree.tostring(markup.to_xml("span"), encoding="unicode") == f"{text}" ) + + +test_cases_markup_from_latex = ( + ("", ""), + ( + "{A}dap{L}e{R}: Speeding up Inference by Adaptive Length Reduction", + "AdapLeR: Speeding up Inference by Adaptive Length Reduction", + ), + ( + "\\textbf{D}ynamic \\textbf{S}chema \\textbf{G}raph \\textbf{F}usion \\textbf{Net}work (\\textbf{DSGFNet})", + "Dynamic Schema Graph Fusion Network (DSGFNet)", + ), + ( + "selecting prompt templates \\textit{without labeled examples} and \\emph{without direct access to the model}.", + "selecting prompt templates without labeled examples and without direct access to the model.", + ), + ( + "$^{\\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity", + "^{\\mathcal{E}}: a Vectorial Resource for Computing Conceptual Similarity", + ), + ( + "The source code will be available at \\url{https://github.com/zhang-yu-wei/MTP-CLNN}.", + "The source code will be available at https://github.com/zhang-yu-wei/MTP-CLNN.", + ), + ( + "Workshop on Topic A {\\&} B", + "Workshop on Topic A & B", + ), + ( + "{U}pstream {M}itigation {I}s 
\\textit{{N}ot} {A}ll {Y}ou {N}eed", + "Upstream Mitigation Is Not All You Need", + ), + ( + "\\textbf{Con\\textit{trived}} {\\textbf{Ex}AMP\\textit{L}e} of N\\textbf{es$_{te}$d} markup", + "Contrived ExAMPLe of Nes_{te}d markup", + ), + ( + "\\textit{D\\textbf{e\\textit{e\\textbf{e\\textit{e\\textbf{p}}}}}}ly", + "Deeeeply", + ), + ( + '{\\"A}{\\"o}{\\o}{\\\'e}{\\"y}{\\H{o}}{\\ss}{\\^u}{--}', + "Äöøéÿőßû–", + ), + ( + "Haji{\\v{c}}, Jan and Wo{\\'z}niak, Micha{\\l}", + "Hajič, Jan and Woźniak, Michał", + ), + ( + "{\\v{Z}}abokrtsk{\\'y}, Zden{\\v{e}}k and {\\v{S}}ev{\\v{c}}{\\'i}kov{\\'a}, Magda", + "Žabokrtský, Zdeněk and Ševčíková, Magda", + ), + ( + "{\\'i}{\\`i}{\\\"i}{\\^i}{\\i} {\\'I}{\\`I}{\\\"I}{\\^I}{\\.I}", + "íìïîı ÍÌÏÎİ", + ), + ( + "陳大文", + "陳大文", + ), + ( + "A $4.9\\%$ increase", + "A 4.9% increase", + ), + ( + "A $\\log 25$ increase", + "A \\log 25 increase", + ), +) + + +@pytest.mark.parametrize("inp, out", test_cases_markup_from_latex) +def test_markup_from_latex(inp, out): + markup = MarkupText.from_latex(inp) + assert markup.as_xml() == out From 68cb2d0e753c1ce15e64f74ed837ac150aa21126 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 6 Mar 2025 10:49:18 +0100 Subject: [PATCH 2/3] Test and fix bug for unhandled LaTeX commands --- python/acl_anthology/utils/latex.py | 2 +- python/tests/text/markuptext_test.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py index a146b8ab29..a960cc2ff1 100644 --- a/python/acl_anthology/utils/latex.py +++ b/python/acl_anthology/utils/latex.py @@ -337,7 +337,7 @@ def _parse_nodelist_to_element( log.warning(f"Unhandled LaTeX macro '{node.macroname}'") subnodes = node.nodeargd.argnlist _parse_nodelist_to_element( - subnodes, subelem, use_fixed_case, in_macro=True + subnodes, element, use_fixed_case, in_macro=True ) elif node.isNodeType(LatexGroupNode): # Bracketed group, such as {...} or [...] 
diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py index d872453067..ccee91274e 100644 --- a/python/tests/text/markuptext_test.py +++ b/python/tests/text/markuptext_test.py @@ -249,6 +249,10 @@ def test_simple_string(): "A $\\log 25$ increase", "A \\log 25 increase", ), + ( + "An \\textsc{unhandled} command", + "An unhandled command", + ), ) From 28e6a0b845a329c8567e1f6deb66c8b4856657e5 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 6 Mar 2025 11:06:39 +0100 Subject: [PATCH 3/3] Map citations onto "(CITATION)" --- python/acl_anthology/utils/latex.py | 31 ++++++++++++++++++++++------ python/tests/text/markuptext_test.py | 4 ++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py index a960cc2ff1..3f31f5fc27 100644 --- a/python/acl_anthology/utils/latex.py +++ b/python/acl_anthology/utils/latex.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Functions implementing the conversion to LaTeX/BibTeX formats.""" +"""Functions implementing conversions to and from LaTeX/BibTeX formats.""" from __future__ import annotations @@ -45,7 +45,11 @@ LatexMathNode, LatexSpecialsNode, ) -from pylatexenc.latex2text import LatexNodes2Text +from pylatexenc.latex2text import ( + LatexNodes2Text, + MacroTextSpec, + get_default_latex_context_db, +) log = get_logger() @@ -248,9 +252,16 @@ def namespecs_to_bibtex(namespecs: list[NameSpecification]) -> str: "bf": "b", "url": "url", } -LATEX_TO_TEXT = LatexNodes2Text( - strict_latex_spaces=True, +LATEX_CITE_MACROS = {"cite", "citep", "citet", "newcite", "citeauthor", "citeyear"} +L2T_CONTEXT = get_default_latex_context_db() +L2T_CONTEXT.add_context_category( + "citations", + prepend=True, + macros=[ + MacroTextSpec(macro, simplify_repl=r"(CITATION)") for macro in LATEX_CITE_MACROS + ], ) +LATEX_TO_TEXT = LatexNodes2Text(strict_latex_spaces=True, latex_context=L2T_CONTEXT) def _is_trivial_math(node: LatexMathNode) -> bool: @@ -315,7 +326,9 @@ def _parse_nodelist_to_element( None; the XML element is modified in-place. 
""" for node in nodelist: - if node.isNodeType(LatexCharsNode): + if node is None: + continue + elif node.isNodeType(LatexCharsNode): # Plain text append_text(element, node.chars) elif node.isNodeType(LatexMacroNode): @@ -327,6 +340,9 @@ def _parse_nodelist_to_element( _parse_nodelist_to_element( subnodes, subelem, use_fixed_case, in_macro=True ) + elif node.macroname in LATEX_CITE_MACROS: + # A citation command such as \cite{...} + append_text(element, LATEX_TO_TEXT.macro_node_to_text(node)) elif _should_parse_macro_as_text(node): # This macro should be parsed as text because it probably # represents a special character, such as \v{c} or \"I @@ -345,9 +361,12 @@ def _parse_nodelist_to_element( # Protect this with , then recurse subelem = etree.SubElement(element, "fixed-case") _parse_nodelist_to_element(node.nodelist, subelem, False) - else: + elif node.delimiters == ("{", "}"): # Just recurse _parse_nodelist_to_element(node.nodelist, element, use_fixed_case) + else: + # Skip [...] or <...> groups + pass elif node.isNodeType(LatexMathNode): # Math node if _is_trivial_math(node): diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py index ccee91274e..8b9f36f17f 100644 --- a/python/tests/text/markuptext_test.py +++ b/python/tests/text/markuptext_test.py @@ -253,6 +253,10 @@ def test_simple_string(): "An \\textsc{unhandled} command", "An unhandled command", ), + ( + "A citation \\cite[p.32]{doe-et-al-2024}", + "A citation (CITATION)", + ), )