Support attributes in HTML paths in style mappings

mwilliamson · mwilliamson · commit fe41ddbc51b4 · 2024-02-17T13:52:16.000Z
diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+# 1.7.0
+
+* Support attributes in HTML paths in style mappings.
+
 # 1.6.0
 
 * Support merged paragraphs when revisions are tracked.
diff --git a/README.md b/README.md
@@ -649,6 +649,12 @@ append a dot followed by the name of the class:
 h1.section-title
 ```
 
+To add an attribute, put `name='value'` in square brackets, in the same way as a CSS attribute selector:
+
+```
+p[lang='fr']
+```
+
 To require that an element is fresh, use `:fresh`:
 
 ```
diff --git a/mammoth/html_paths.py b/mammoth/html_paths.py
@@ -7,15 +7,16 @@ def path(elements):
     return HtmlPath(elements)
 
 
-def element(names, class_names=None, fresh=None, separator=None):
+def element(names, attributes=None, class_names=None, fresh=None, separator=None):
+    if attributes is None:
+        attributes = {}
     if class_names is None:
         class_names = []
     if fresh is None:
         fresh = False
     if class_names:
-        attributes = {"class": " ".join(class_names)}
-    else:
-        attributes = {}
+        attributes["class"] = " ".join(class_names)
+
     return HtmlPathElement(html.tag(
         tag_names=names,
         attributes=attributes,
@@ -27,13 +28,13 @@ def element(names, class_names=None, fresh=None, separator=None):
 @cobble.data
 class HtmlPath(object):
     elements = cobble.field()
-    
+
     def wrap(self, generate_nodes):
         nodes = generate_nodes()
 
         for element in reversed(self.elements):
             nodes = element.wrap_nodes(nodes)
-        
+
         return nodes
 
 
diff --git a/mammoth/styles/parser/html_path_parser.py b/mammoth/styles/parser/html_path_parser.py
@@ -1,6 +1,15 @@
+import cobble
+
 from ... import html_paths
 from .tokeniser import TokenType
-from .token_parser import parse_identifier, parse_string, try_parse_class_name
+from .token_parser import parse_identifier, parse_string
+
+
+@cobble.data
+class _AttributeOrClassName(object):  # one parsed `[name='value']` or `.class-name` suffix of an element
+    name = cobble.field()  # attribute name; "class" when built from a `.class-name` suffix
+    value = cobble.field()  # attribute value, or a single class name
+    append = cobble.field()  # True: space-join onto any existing value; False: overwrite it
 
 
 def parse_html_path(tokens):
@@ -12,51 +21,82 @@ def parse_html_path(tokens):
 
 def _parse_html_path_elements(tokens):
     elements = []
-    
+
     if tokens.peek_token_type() == TokenType.IDENTIFIER:
         elements.append(_parse_element(tokens))
-        
+
         while tokens.try_skip_many(((TokenType.WHITESPACE, None), (TokenType.SYMBOL, ">"))):
             tokens.skip(TokenType.WHITESPACE)
             elements.append(_parse_element(tokens))
-        
+
     return elements
 
 
 def _parse_element(tokens):
     tag_names = _parse_tag_names(tokens)
-    class_names = _parse_class_names(tokens)
+    attributes_list = _parse_attribute_or_class_names(tokens)  # `[...]` and `.class` suffixes, in source order
     is_fresh = _parse_is_fresh(tokens)
     separator = _parse_separator(tokens)
-    
+
+    attributes = {}
+    for attribute in attributes_list:
+        if attribute.append and attributes.get(attribute.name):  # a non-empty value is already present
+            attributes[attribute.name] += " " + attribute.value  # e.g. `.a.b` yields class "a b"
+        else:
+            attributes[attribute.name] = attribute.value  # first value, or a non-appending entry replaces it
+
     return html_paths.element(
         tag_names,
-        class_names=class_names,
+        attributes=attributes,
         fresh=is_fresh,
         separator=separator,
     )
 
 
 def _parse_tag_names(tokens):
     tag_names = [parse_identifier(tokens)]
-    
+
     while tokens.try_skip(TokenType.SYMBOL, "|"):
         tag_names.append(parse_identifier(tokens))
-    
+
     return tag_names
 
 
-def _parse_class_names(tokens):
-    class_names = []
-    
+def _parse_attribute_or_class_names(tokens):  # collects every consecutive `[...]` / `.class` suffix
+    attribute_or_class_names = []
+
     while True:
-        class_name = try_parse_class_name(tokens)
-        if class_name is None:
+        attribute_or_class_name = _try_parse_attribute_or_class_name(tokens)
+        if attribute_or_class_name is None:  # next token starts neither kind of suffix
             break
         else:
-            class_names.append(class_name)
-    
-    return class_names
+            attribute_or_class_names.append(attribute_or_class_name)
+
+    return attribute_or_class_names  # entries stay in the order they were parsed
+
+
+def _try_parse_attribute_or_class_name(tokens):
+    # Peek only: returns None without consuming anything when neither `[` nor `.` comes next.
+    if tokens.is_next(TokenType.SYMBOL, "["):
+        return _parse_attribute(tokens)
+    elif tokens.is_next(TokenType.SYMBOL, "."):
+        return _parse_class_name(tokens)
+    return None
+
+
+def _parse_attribute(tokens):  # parses one `[name='value']` suffix; a token mismatch raises from skip()
+    tokens.skip(TokenType.SYMBOL, "[")
+    name = parse_identifier(tokens)
+    tokens.skip(TokenType.SYMBOL, "=")
+    value = parse_string(tokens)  # NOTE(review): presumably requires a quoted string token - confirm in token_parser
+    tokens.skip(TokenType.SYMBOL, "]")
+    return _AttributeOrClassName(name=name, value=value, append=False)  # append=False: value is set, not joined
+
+
+def _parse_class_name(tokens):  # parses one `.class-name` suffix as an appending "class" attribute
+    tokens.skip(TokenType.SYMBOL, ".")
+    class_name = parse_identifier(tokens)
+    return _AttributeOrClassName(name="class", value=class_name, append=True)  # append=True: join with a space
 
 
 def _parse_is_fresh(tokens):
diff --git a/mammoth/styles/parser/token_iterator.py b/mammoth/styles/parser/token_iterator.py
@@ -8,37 +8,36 @@ class TokenIterator(object):
     def __init__(self, tokens):
         self._tokens = tokens
         self._index = 0
-    
+
     def peek_token_type(self):
         return self._tokens[self._index].type
-    
+
     def next_value(self, token_type=None):
         return self._next(token_type).value
-    
+
     def _next(self, token_type=None):
         token = self._tokens[self._index]
         if token_type is None or token.type == token_type:
             self._index += 1
             return token
         else:
             raise self._unexpected_token_type(token_type, token)
-    
+
     def skip(self, token_type, token_value=None):
         token = self._tokens[self._index]
         if token.type == token_type and (token_value is None or token.value == token_value):
             self._index += 1
             return True
         else:
             raise self._unexpected_token_type(token_type, token)
-    
+
     def try_skip(self, token_type, token_value=None):
-        token = self._tokens[self._index]
-        if token.type == token_type and (token_value is None or token.value == token_value):
+        if self.is_next(token_type, token_value):  # delegates the type/value check to is_next
             self._index += 1
             return True
         else:
             return False
-    
+
     def try_skip_many(self, tokens):
         start = self._index
         for token_type, token_value in tokens:
@@ -48,9 +47,13 @@ def try_skip_many(self, tokens):
                 return False
             else:
                 self._index += 1
-        
+
         return True
-    
+
+    def is_next(self, token_type, token_value=None):  # peek: tests the current token without advancing
+        token = self._tokens[self._index]
+        return token.type == token_type and (token_value is None or token.value == token_value)  # value compared only when given
+
     def _unexpected_token_type(self, token_type, token):
         raise LineParseError()
-    
+
diff --git a/tests/styles/parser/html_path_parser_tests.py b/tests/styles/parser/html_path_parser_tests.py
@@ -53,6 +53,20 @@ def test_can_read_multiple_classes_on_element():
     )
 
 
+def test_can_read_attribute_on_element():
+    assert_equal(
+        html_paths.path([html_paths.element(["p"], attributes={"lang": "fr"})]),  # expected path
+        read_html_path("p[lang='fr']")  # a single `[name='value']` suffix
+    )
+
+
+def test_can_read_multiple_attributes_on_element():
+    assert_equal(
+        html_paths.path([html_paths.element(["p"], attributes={"lang": "fr", "data-x": "y"})]),  # expected path
+        read_html_path("p[lang='fr'][data-x='y']")  # two suffixes with different attribute names
+    )
+
+
 def test_can_read_when_element_must_be_fresh():
     assert_equal(
         html_paths.path([html_paths.element(["p"], fresh=True)]),