From 5c1bdb3ae72efe520484b95e959d3570b68bfede Mon Sep 17 00:00:00 2001
From: Marcel Bollmann <marcel@bollmann.me>
Date: Wed, 5 Mar 2025 15:58:36 +0100
Subject: [PATCH 1/3] Add first version of LaTeX-to-MarkupXML conversion

This reimplements most of `bin/latex_to_unicode.py` within the new `acl_anthology` library.
More tests are needed, and some conversions performed by `bin/latex_to_unicode.py` are still missing.
---
 python/acl_anthology/text/markuptext.py |  13 ++
 python/acl_anthology/utils/__init__.py  |   8 +-
 python/acl_anthology/utils/latex.py     | 169 ++++++++++++++++++++++++
 python/acl_anthology/utils/xml.py       |  24 ++++
 python/tests/text/markuptext_test.py    |  77 ++++++++++-
 5 files changed, 289 insertions(+), 2 deletions(-)

diff --git a/python/acl_anthology/text/markuptext.py b/python/acl_anthology/text/markuptext.py
index fa52f6fc39..1c724ec7b4 100644
--- a/python/acl_anthology/text/markuptext.py
+++ b/python/acl_anthology/text/markuptext.py
@@ -26,6 +26,7 @@
 from ..utils import (
     latex_encode,
     latex_convert_quotes,
+    parse_latex_to_xml,
     remove_extra_whitespace,
     stringify_children,
 )
@@ -181,6 +182,18 @@ def from_string(cls, text: str) -> MarkupText:
         """
         return cls(text)
 
+    @classmethod
+    def from_latex(cls, text: str) -> MarkupText:
+        """
+        Arguments:
+            text: A text string potentially containing LaTeX markup.
+
+        Returns:
+            Instantiated MarkupText object corresponding to the string.
+        """
+        element = parse_latex_to_xml(text)
+        return cls.from_xml(element)
+
     @classmethod
     def from_xml(cls, element: etree._Element) -> MarkupText:
         """
diff --git a/python/acl_anthology/utils/__init__.py b/python/acl_anthology/utils/__init__.py
index b34dac41a5..17ac1cd6c4 100644
--- a/python/acl_anthology/utils/__init__.py
+++ b/python/acl_anthology/utils/__init__.py
@@ -15,7 +15,12 @@
 from .citation import citeproc_render_html
 from .git import clone_or_pull_from_repo
 from .ids import build_id, parse_id, AnthologyID
-from .latex import latex_encode, latex_convert_quotes, make_bibtex_entry
+from .latex import (
+    latex_encode,
+    latex_convert_quotes,
+    make_bibtex_entry,
+    parse_latex_to_xml,
+)
 from .logging import setup_rich_logging, get_logger
 from .text import remove_extra_whitespace
 from .xml import stringify_children
@@ -31,6 +36,7 @@
     "latex_convert_quotes",
     "make_bibtex_entry",
     "parse_id",
+    "parse_latex_to_xml",
     "remove_extra_whitespace",
     "setup_rich_logging",
     "stringify_children",
diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py
index a52ef70615..a146b8ab29 100644
--- a/python/acl_anthology/utils/latex.py
+++ b/python/acl_anthology/utils/latex.py
@@ -18,6 +18,7 @@
 
 import re
 from functools import lru_cache
+from lxml import etree
 from typing import cast, Optional, TypeAlias, TYPE_CHECKING
 
 if TYPE_CHECKING:
@@ -27,12 +28,30 @@
     SerializableAsBibTeX: TypeAlias = None | str | MarkupText | list[NameSpecification]
     """Any type that can be supplied to `make_bibtex_entry`."""
 
+from .logging import get_logger
+from .xml import append_text
 
 from pylatexenc.latexencode import (
     UnicodeToLatexEncoder,
     UnicodeToLatexConversionRule,
     RULE_DICT,
 )
+from pylatexenc.latexwalker import (
+    LatexWalker,
+    LatexNode,
+    LatexCharsNode,
+    LatexGroupNode,
+    LatexMacroNode,
+    LatexMathNode,
+    LatexSpecialsNode,
+)
+from pylatexenc.latex2text import LatexNodes2Text
+
+log = get_logger()
+
+################################################################################
+### UNICODE TO LATEX (BIBTEX)
+################################################################################
 
 LATEXENC = UnicodeToLatexEncoder(
     conversion_rules=[
@@ -54,6 +73,7 @@
     unknown_char_policy="keep",
     unknown_char_warning=False,
 )
+"""A UnicodeToLatexEncoder instance intended for BibTeX generation."""
 
 BIBTEX_FIELD_NEEDS_ENCODING = {"journal", "address", "publisher", "note"}
 """Any BibTeX field whose value should be LaTeX-encoded first."""
@@ -211,3 +231,152 @@ def namespecs_to_bibtex(namespecs: list[NameSpecification]) -> str:
         A BibTeX-formatted string representing the given names.
     """
     return "  and\n      ".join(spec.name.as_bibtex() for spec in namespecs)
+
+
+################################################################################
+### LATEX TO UNICODE/XML
+################################################################################
+
+LATEX_MACRO_TO_XMLTAG = {
+    "emph": "i",
+    "em": "i",
+    "textit": "i",
+    "it": "i",
+    "textsl": "i",
+    "sl": "i",
+    "textbf": "b",
+    "bf": "b",
+    "url": "url",
+}
+LATEX_TO_TEXT = LatexNodes2Text(
+    strict_latex_spaces=True,
+)
+
+
+def _is_trivial_math(node: LatexMathNode) -> bool:
+    """Helper function to determine whether or not a LatexMathNode contains only 'trivial' content that doesn't require a <tex-math> node."""
+    content = node.latex_verbatim().strip("$").replace(r"\%", "%")
+    return all(c.isspace() or c.isdigit() or c in (".,@%~") for c in content)
+
+
+def _should_parse_macro_as_text(node: LatexMacroNode) -> bool:
+    """Helper function to determine whether or not a LatexMacroNode should be parsed as a simple character macro."""
+    subnodes = node.nodeargd.argnlist
+    if len(subnodes) == 0:
+        # Macro without arguments; e.g. \i or \l
+        return True
+    elif len(subnodes) > 1:
+        # Macro with more than one argument
+        return False
+    subnode = subnodes[0]
+    if subnode.isNodeType(LatexCharsNode) and subnode.len == 1:
+        return True
+    if (
+        subnode.isNodeType(LatexGroupNode)
+        and len(subnode.nodelist) == 1
+        and subnode.nodelist[0].isNodeType(LatexCharsNode)
+        and subnode.nodelist[0].len == 1
+    ):
+        return True
+    return False
+
+
+def _should_wrap_in_fixed_case(node: LatexGroupNode) -> bool:
+    """Helper function to determine whether or not a LatexGroupNode should produce a <fixed-case> tag."""
+    if len(node.nodelist) == 0 or node.delimiters != ("{", "}"):
+        return False
+    if node.latex_verbatim().startswith("{\\"):
+        # {\...} does *not* protect case
+        return False
+    if node.nodelist[0].isNodeType(LatexMathNode):
+        # Don't mark {$...$}
+        return False
+    if node.nodelist[0].isNodeType(LatexSpecialsNode):
+        # Don't mark {``}, {--}, etc.
+        return False
+    return True
+
+
+def _parse_nodelist_to_element(
+    nodelist: list[LatexNode],
+    element: etree._Element,
+    use_fixed_case: bool,
+    in_macro: bool = False,
+) -> None:
+    """Parse a list of LaTeX nodes into an XML element using the Anthology markup format.
+
+    Arguments:
+        nodelist: The list of parsed LaTeX nodes.
+        element: An XML element into which the parsed nodes will be added.
+        use_fixed_case: Flag indicating whether <fixed-case> protection should be applied.
+        in_macro: Flag indicating whether this function was called by recursing into a macro node. (Do not set this manually.)
+
+    Returns:
+        None; the XML element is modified in-place.
+    """
+    for node in nodelist:
+        if node.isNodeType(LatexCharsNode):
+            # Plain text
+            append_text(element, node.chars)
+        elif node.isNodeType(LatexMacroNode):
+            # LaTeX macro
+            if (tag := LATEX_MACRO_TO_XMLTAG.get(node.macroname)) is not None:
+                # This macro should get its own XML tag (e.g. \textbf -> <b>)
+                subelem = etree.SubElement(element, tag)
+                subnodes = node.nodeargd.argnlist
+                _parse_nodelist_to_element(
+                    subnodes, subelem, use_fixed_case, in_macro=True
+                )
+            elif _should_parse_macro_as_text(node):
+                # This macro should be parsed as text because it probably
+                # represents a special character, such as \v{c} or \"I
+                append_text(element, LATEX_TO_TEXT.macro_node_to_text(node))
+            else:
+                # This is a macro we don't know how to handle - emit warning,
+                # then discard macro but recurse into its children
+                log.warning(f"Unhandled LaTeX macro '{node.macroname}'")
+                subnodes = node.nodeargd.argnlist
+                _parse_nodelist_to_element(
+                    subnodes, subelem, use_fixed_case, in_macro=True
+                )
+        elif node.isNodeType(LatexGroupNode):
+            # Bracketed group, such as {...} or [...]
+            if not in_macro and _should_wrap_in_fixed_case(node):
+                # Protect this with <fixed-case>, then recurse
+                subelem = etree.SubElement(element, "fixed-case")
+                _parse_nodelist_to_element(node.nodelist, subelem, False)
+            else:
+                # Just recurse
+                _parse_nodelist_to_element(node.nodelist, element, use_fixed_case)
+        elif node.isNodeType(LatexMathNode):
+            # Math node
+            if _is_trivial_math(node):
+                # Just append as text
+                append_text(element, LATEX_TO_TEXT.math_node_to_text(node))
+            else:
+                # Keep verbatim, but wrap in <tex-math>
+                subelem = etree.SubElement(element, "tex-math")
+                subelem.text = node.latex_verbatim().strip("$")
+        elif node.isNodeType(LatexSpecialsNode):
+            # TODO: Is this always the correct way?
+            append_text(element, LATEX_TO_TEXT.specials_node_to_text(node))
+        else:
+            # Comments or environments
+            log.warning(f"Unhandled node type: {node.nodeType}")
+
+
+def parse_latex_to_xml(latex_input: str, use_fixed_case: bool = True) -> etree._Element:
+    """Convert a string with LaTeX markup into the Anthology XML format.
+
+    Arguments:
+        latex_input: A string potentially including LaTeX markup.
+        use_fixed_case: Flag indicating whether <fixed-case> protection should be applied.
+
+    Returns:
+        An XML element representing the given LaTeX input in the Anthology XML format for markup strings.
+    """
+    element = etree.Element("root")
+    walker = LatexWalker(latex_input)
+    nodelist, *_ = walker.get_latex_nodes()
+    _parse_nodelist_to_element(nodelist, element, use_fixed_case)
+    return element
diff --git a/python/acl_anthology/utils/xml.py b/python/acl_anthology/utils/xml.py
index dd9c7b8ca1..3f771dc013 100644
--- a/python/acl_anthology/utils/xml.py
+++ b/python/acl_anthology/utils/xml.py
@@ -81,6 +81,30 @@ def assert_equals(elem: etree._Element, other: etree._Element) -> None:
             assert_equals(elem_child, other_child)
 
 
+def append_text(elem: etree._Element, text: str) -> None:
+    """Append text to an XML element.
+
+    If the XML element has children, the text will be appended to the tail of the last child; otherwise, it will be appended to its text attribute.
+
+    Arguments:
+        elem: The XML element.
+        text: The text string to append to the XML element.
+
+    Returns:
+        None; the XML element is modified in-place.
+    """
+    if len(elem):
+        # already has children — append text to tail
+        if elem[-1].tail is not None:
+            elem[-1].tail = "".join((elem[-1].tail, text))
+        else:
+            elem[-1].tail = text
+    elif elem.text is not None:
+        elem.text = "".join((elem.text, text))
+    else:
+        elem.text = text
+
+
 def clean_whitespace(
     text: Optional[str], func: Optional[Callable[[str], str]] = None
 ) -> Optional[str]:
diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py
index 5132e75a8b..d872453067 100644
--- a/python/tests/text/markuptext_test.py
+++ b/python/tests/text/markuptext_test.py
@@ -157,7 +157,7 @@
 
 
 @pytest.mark.parametrize("inp, out", test_cases_markup)
-def test_markup(inp, out):
+def test_markup_from_xml(inp, out):
     xml = f"<title>{inp}</title>"
     element = etree.fromstring(xml)
     markup = MarkupText.from_xml(element)
@@ -181,3 +181,78 @@ def test_simple_string():
         etree.tostring(markup.to_xml("span"), encoding="unicode")
         == f"<span>{text}</span>"
     )
+
+
+test_cases_markup_from_latex = (
+    ("", ""),
+    (
+        "{A}dap{L}e{R}: Speeding up Inference by Adaptive Length Reduction",
+        "<fixed-case>A</fixed-case>dap<fixed-case>L</fixed-case>e<fixed-case>R</fixed-case>: Speeding up Inference by Adaptive Length Reduction",
+    ),
+    (
+        "\\textbf{D}ynamic \\textbf{S}chema \\textbf{G}raph \\textbf{F}usion \\textbf{Net}work (\\textbf{DSGFNet})",
+        "<b>D</b>ynamic <b>S</b>chema <b>G</b>raph <b>F</b>usion <b>Net</b>work (<b>DSGFNet</b>)",
+    ),
+    (
+        "selecting prompt templates \\textit{without labeled examples} and \\emph{without direct access to the model}.",
+        "selecting prompt templates <i>without labeled examples</i> and <i>without direct access to the model</i>.",
+    ),
+    (
+        "$^{\\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity",
+        "<tex-math>^{\\mathcal{E}}</tex-math>: a Vectorial Resource for Computing Conceptual Similarity",
+    ),
+    (
+        "The source code will be available at \\url{https://github.com/zhang-yu-wei/MTP-CLNN}.",
+        "The source code will be available at <url>https://github.com/zhang-yu-wei/MTP-CLNN</url>.",
+    ),
+    (
+        "Workshop on Topic A {\\&} B",
+        "Workshop on Topic A &amp; B",
+    ),
+    (
+        "{U}pstream {M}itigation {I}s \\textit{{N}ot} {A}ll {Y}ou {N}eed",
+        "<fixed-case>U</fixed-case>pstream <fixed-case>M</fixed-case>itigation <fixed-case>I</fixed-case>s <i><fixed-case>N</fixed-case>ot</i> <fixed-case>A</fixed-case>ll <fixed-case>Y</fixed-case>ou <fixed-case>N</fixed-case>eed",
+    ),
+    (
+        "\\textbf{Con\\textit{trived}} {\\textbf{Ex}AMP\\textit{L}e} of N\\textbf{es$_{te}$d} markup",
+        "<b>Con<i>trived</i></b> <b>Ex</b>AMP<i>L</i>e of N<b>es<tex-math>_{te}</tex-math>d</b> markup",
+    ),
+    (
+        "\\textit{D\\textbf{e\\textit{e\\textbf{e\\textit{e\\textbf{p}}}}}}ly",
+        "<i>D<b>e<i>e<b>e<i>e<b>p</b></i></b></i></b></i>ly",
+    ),
+    (
+        '{\\"A}{\\"o}{\\o}{\\\'e}{\\"y}{\\H{o}}{\\ss}{\\^u}{--}',
+        "Äöøéÿőßû–",
+    ),
+    (
+        "Haji{\\v{c}}, Jan and Wo{\\'z}niak, Micha{\\l}",
+        "Hajič, Jan and Woźniak, Michał",
+    ),
+    (
+        "{\\v{Z}}abokrtsk{\\'y}, Zden{\\v{e}}k and {\\v{S}}ev{\\v{c}}{\\'i}kov{\\'a}, Magda",
+        "Žabokrtský, Zdeněk and Ševčíková, Magda",
+    ),
+    (
+        "{\\'i}{\\`i}{\\\"i}{\\^i}{\\i} {\\'I}{\\`I}{\\\"I}{\\^I}{\\.I}",
+        "íìïîı ÍÌÏÎİ",
+    ),
+    (
+        "陳大文",
+        "陳大文",
+    ),
+    (
+        "A $4.9\\%$ increase",
+        "A 4.9% increase",
+    ),
+    (
+        "A $\\log 25$ increase",
+        "A <tex-math>\\log 25</tex-math> increase",
+    ),
+)
+
+
+@pytest.mark.parametrize("inp, out", test_cases_markup_from_latex)
+def test_markup_from_latex(inp, out):
+    markup = MarkupText.from_latex(inp)
+    assert markup.as_xml() == out

From 68cb2d0e753c1ce15e64f74ed837ac150aa21126 Mon Sep 17 00:00:00 2001
From: Marcel Bollmann <marcel@bollmann.me>
Date: Thu, 6 Mar 2025 10:49:18 +0100
Subject: [PATCH 2/3] Fix wrong target element for unhandled LaTeX macros; add test

---
 python/acl_anthology/utils/latex.py  | 2 +-
 python/tests/text/markuptext_test.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py
index a146b8ab29..a960cc2ff1 100644
--- a/python/acl_anthology/utils/latex.py
+++ b/python/acl_anthology/utils/latex.py
@@ -337,7 +337,7 @@ def _parse_nodelist_to_element(
                 log.warning(f"Unhandled LaTeX macro '{node.macroname}'")
                 subnodes = node.nodeargd.argnlist
                 _parse_nodelist_to_element(
-                    subnodes, subelem, use_fixed_case, in_macro=True
+                    subnodes, element, use_fixed_case, in_macro=True
                 )
         elif node.isNodeType(LatexGroupNode):
             # Bracketed group, such as {...} or [...]
diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py
index d872453067..ccee91274e 100644
--- a/python/tests/text/markuptext_test.py
+++ b/python/tests/text/markuptext_test.py
@@ -249,6 +249,10 @@ def test_simple_string():
         "A $\\log 25$ increase",
         "A <tex-math>\\log 25</tex-math> increase",
     ),
+    (
+        "An \\textsc{unhandled} command",
+        "An unhandled command",
+    ),
 )
 
 

From 28e6a0b845a329c8567e1f6deb66c8b4856657e5 Mon Sep 17 00:00:00 2001
From: Marcel Bollmann <marcel@bollmann.me>
Date: Thu, 6 Mar 2025 11:06:39 +0100
Subject: [PATCH 3/3] Map citations onto "(CITATION)"

---
 python/acl_anthology/utils/latex.py  | 31 ++++++++++++++++++++++------
 python/tests/text/markuptext_test.py |  4 ++++
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py
index a960cc2ff1..3f31f5fc27 100644
--- a/python/acl_anthology/utils/latex.py
+++ b/python/acl_anthology/utils/latex.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Functions implementing the conversion to LaTeX/BibTeX formats."""
+"""Functions implementing conversions to and from LaTeX/BibTeX formats."""
 
 from __future__ import annotations
 
@@ -45,7 +45,11 @@
     LatexMathNode,
     LatexSpecialsNode,
 )
-from pylatexenc.latex2text import LatexNodes2Text
+from pylatexenc.latex2text import (
+    LatexNodes2Text,
+    MacroTextSpec,
+    get_default_latex_context_db,
+)
 
 log = get_logger()
 
@@ -248,9 +252,16 @@ def namespecs_to_bibtex(namespecs: list[NameSpecification]) -> str:
     "bf": "b",
     "url": "url",
 }
-LATEX_TO_TEXT = LatexNodes2Text(
-    strict_latex_spaces=True,
+LATEX_CITE_MACROS = {"cite", "citep", "citet", "newcite", "citeauthor", "citeyear"}
+L2T_CONTEXT = get_default_latex_context_db()
+L2T_CONTEXT.add_context_category(
+    "citations",
+    prepend=True,
+    macros=[
+        MacroTextSpec(macro, simplify_repl=r"(CITATION)") for macro in LATEX_CITE_MACROS
+    ],
 )
+LATEX_TO_TEXT = LatexNodes2Text(strict_latex_spaces=True, latex_context=L2T_CONTEXT)
 
 
 def _is_trivial_math(node: LatexMathNode) -> bool:
@@ -315,7 +326,9 @@ def _parse_nodelist_to_element(
         None; the XML element is modified in-place.
     """
     for node in nodelist:
-        if node.isNodeType(LatexCharsNode):
+        if node is None:
+            continue
+        elif node.isNodeType(LatexCharsNode):
             # Plain text
             append_text(element, node.chars)
         elif node.isNodeType(LatexMacroNode):
@@ -327,6 +340,9 @@ def _parse_nodelist_to_element(
                 _parse_nodelist_to_element(
                     subnodes, subelem, use_fixed_case, in_macro=True
                 )
+            elif node.macroname in LATEX_CITE_MACROS:
+                # A citation command such as \cite{...}
+                append_text(element, LATEX_TO_TEXT.macro_node_to_text(node))
             elif _should_parse_macro_as_text(node):
                 # This macro should be parsed as text because it probably
                 # represents a special character, such as \v{c} or \"I
@@ -345,9 +361,12 @@ def _parse_nodelist_to_element(
                 # Protect this with <fixed-case>, then recurse
                 subelem = etree.SubElement(element, "fixed-case")
                 _parse_nodelist_to_element(node.nodelist, subelem, False)
-            else:
+            elif node.delimiters == ("{", "}"):
                 # Just recurse
                 _parse_nodelist_to_element(node.nodelist, element, use_fixed_case)
+            else:
+                # Skip [...] or <...> groups
+                pass
         elif node.isNodeType(LatexMathNode):
             # Math node
             if _is_trivial_math(node):
diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py
index ccee91274e..8b9f36f17f 100644
--- a/python/tests/text/markuptext_test.py
+++ b/python/tests/text/markuptext_test.py
@@ -253,6 +253,10 @@ def test_simple_string():
         "An \\textsc{unhandled} command",
         "An unhandled command",
     ),
+    (
+        "A citation \\cite[p.32]{doe-et-al-2024}",
+        "A citation (CITATION)",
+    ),
 )