Skip to content

Commit 2232a79

Browse files
elxy and bookfere authored
fix(extraction): add support for matching elements that use their own namespaces.
* fix(extraction): support MathML, SVG, and other non-XHTML namespace nodes
* Add test cases.

---------

Co-authored-by: bookfere <bookfere@gmail.com>
1 parent 68529ce commit 2232a79

4 files changed

Lines changed: 83 additions & 35 deletions

File tree

lib/element.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,8 @@
33
import copy
44
from typing import Any
55

6-
from lxml import etree
6+
from lxml import etree # type: ignore
7+
78
from calibre import prepare_string_for_xml as xml_escape # type: ignore
89

910
from .utils import (
@@ -684,7 +685,7 @@ def load_reserve_rules(self, rules=[]):
684685
# conflicts with the mechanism of merge translation.
685686
default_rules = (
686687
'img', 'code', 'br', 'hr', 'sub', 'sup', 'kbd', 'abbr', 'wbr',
687-
'var', 'canvas', 'svg', 'script', 'style')
688+
'var', 'canvas', 'svg', 'script', 'style', 'math')
688689
self.reserve_pattern = create_xpath(default_rules + tuple(rules))
689690

690691
def prepare_original(self, elements):

lib/utils.py

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -10,8 +10,8 @@
1010
from subprocess import Popen
1111
from contextlib import contextmanager
1212

13-
from mechanize import Browser, Request, HTTPError
14-
from mechanize._response import response_seek_wrapper as Response
13+
from mechanize import Browser, Request, HTTPError # type: ignore
14+
from mechanize._response import response_seek_wrapper as Response # type: ignore
1515

1616
from calibre import get_proxies # type: ignore
1717
from calibre.utils.logging import Log # type: ignore
@@ -23,8 +23,6 @@
2323
ns = {'x': 'http://www.w3.org/1999/xhtml'}
2424
is_test = 'unittest' in sys.modules
2525
log = Log(level=Log.DEBUG if os.environ.get('CALIBRE_DEBUG') else Log.INFO)
26-
27-
log.debug('Backup original socket: ', id(socket.socket))
2826
original_socket = socket.socket
2927

3028

@@ -36,24 +34,32 @@ def sep(char='═', count=38):
3634
return char * count
3735

3836

39-
def css(selector):
37+
def css(selector: str) -> str | None:
4038
try:
4139
return GenericTranslator().css_to_xpath(selector, prefix='self::x:')
4240
except SelectorError:
4341
return None
4442

4543

46-
def css_to_xpath(selectors):
44+
def css_to_xpath(selectors: tuple | list) -> list:
4745
patterns = []
46+
simple_tag = re.compile(r'^[A-Za-z][\w-]*$')
4847
for selector in selectors:
49-
if rule := css(selector):
50-
patterns.append(rule)
48+
rule = css(selector)
49+
if rule is None:
50+
continue
51+
# Add support for matching elements that use their own namespaces.
52+
if simple_tag.match(selector):
53+
rule = f'({rule} or self::*[local-name()="{selector}"])'
54+
patterns.append(rule)
5155
return patterns
5256

5357

54-
def create_xpath(selectors):
58+
def create_xpath(selectors: tuple | str) -> str | None:
5559
selectors = (selectors,) if isinstance(selectors, str) else selectors
56-
return './/*[%s]' % ' or '.join(css_to_xpath(selectors))
60+
if patterns := css_to_xpath(selectors):
61+
return './/*[%s]' % ' or '.join(patterns)
62+
return None
5763

5864

5965
def uid(*args):
@@ -196,6 +202,7 @@ def socks_proxy(host: str, port: int) -> Generator[ModuleType, None, None]:
196202
"""This is a monkey-patch approach to enforce Mechanize to use a SOCKS5
197203
proxy. The context manager restores the original socket after it exits.
198204
"""
205+
log.debug('Backup original socket: ', id(socket.socket))
199206
# Temporarily remove environment proxies to prevent conflicts with the
200207
# SOCKS5 proxy, which might otherwise send connections through an HTTP
201208
# proxy, causing a "General SOCKS server failure" error.

tests/lib/test_element.py

Lines changed: 6 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
import unittest
33
from unittest.mock import patch, Mock
44

5-
from lxml import etree
5+
from lxml import etree # type: ignore
66

77
from calibre.ebooks.oeb.base import TOC, Metadata # type: ignore
88

@@ -437,7 +437,7 @@ def test_get_content(self):
437437
self.assertEqual(
438438
'<img src="w2.jpg"></img>', self.element.reserve_elements[4])
439439
self.assertEqual(
440-
'<img alt="{\D}" src="w3.jpg"></img>',
440+
'<img alt="{\D}" src="w3.jpg"></img>', # type: ignore
441441
self.element.reserve_elements[5])
442442
self.assertEqual(
443443
'<img src="w3.jpg"></img>', self.element.reserve_elements[6])
@@ -740,10 +740,10 @@ def test_add_translation_only_keep_anchor(self):
740740
self.assertEqual('abc', a.get('href'))
741741
self.assertEqual('A', a.text)
742742

743-
def test_add_translation_table(slef):
743+
def test_add_translation_table(self):
744744
pass
745745

746-
def test_add_translation_table_only(slef):
746+
def test_add_translation_table_only(self):
747747
pass
748748

749749
def test_add_translation_line_break_below(self):
@@ -934,8 +934,6 @@ def test_create_extraction(self):
934934
self.assertEqual(
935935
[re.compile(default_rule)],
936936
self.extraction.filter_patterns)
937-
self.assertEqual(
938-
['self::x:pre', 'self::x:code'], self.extraction.ignore_patterns)
939937

940938
def test_get_sorted_pages(self):
941939
self.assertEqual(
@@ -1278,13 +1276,11 @@ def test_set_column_gap(self):
12781276

12791277
def test_load_remove_rules(self):
12801278
self.handler.load_remove_rules()
1281-
self.assertEqual(
1282-
'.//*[self::x:rt or self::x:rp]', self.handler.remove_pattern)
1279+
self.assertIsNotNone(self.handler.remove_pattern)
12831280

12841281
def test_load_reserve_rules(self):
12851282
self.handler.load_reserve_rules()
1286-
self.assertRegex(
1287-
self.handler.reserve_pattern, r'^\.//\*\[self::x:img.*style\]$')
1283+
self.assertIsNotNone(self.handler.reserve_pattern)
12881284

12891285
@patch('calibre_plugins.ebook_translator.lib.element.uid')
12901286
def test_prepare_original(self, mock_uid):
@@ -1312,12 +1308,6 @@ def test_prepare_original(self, mock_uid):
13121308
self.assertEqual('red', element.original_color)
13131309
self.assertEqual('green', element.translation_color)
13141310
self.assertEqual(('percentage', 20), element.column_gap)
1315-
self.assertEqual(
1316-
'.//*[self::x:rt or self::x:rp]',
1317-
self.handler.remove_pattern)
1318-
self.assertRegex(
1319-
element.reserve_pattern or '',
1320-
r'^\.//\*\[self::x:img.*style\]$')
13211311

13221312
@patch('calibre_plugins.ebook_translator.lib.element.uid')
13231313
def test_prepare_translation_contains_ignored_element(self, mock_uid):
@@ -1544,11 +1534,6 @@ def test_prepare_original_merge_separator(self, mock_uid):
15441534
self.assertEqual('red', element.original_color)
15451535
self.assertEqual('green', element.translation_color)
15461536
self.assertEqual(('percentage', 20), element.column_gap)
1547-
self.assertEqual(
1548-
'.//*[self::x:rt or self::x:rp]',
1549-
self.handler.remove_pattern)
1550-
self.assertRegex(
1551-
element.reserve_pattern, r'^\.//\*\[self::x:img.*style\]$')
15521537

15531538
@patch('calibre_plugins.ebook_translator.lib.element.uid')
15541539
def test_prepare_original_merge_separator_multiple(self, mock_uid):

tests/lib/test_utils.py

Lines changed: 57 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,16 +2,71 @@
22
from unittest.mock import patch
33
from types import GeneratorType
44

5+
from lxml import etree # type: ignore
6+
7+
from ...vendor.cssselect import SelectorError
8+
59
from ...lib.utils import (
6-
css_to_xpath, uid, trim, chunk, group, open_file, request)
10+
ns, css, css_to_xpath, create_xpath, uid, trim, chunk, group, open_file,
11+
request)
712

813

914
module_name = 'calibre_plugins.ebook_translator.lib.utils'
1015

1116

1217
class TestUtils(unittest.TestCase):
18+
def test_css(self):
19+
self.assertEqual("self::x:div[@id = 'id']", css('div#id'))
20+
self.assertIsNone(css('div>>p')) # omit invalid selector
21+
1322
def test_css_to_xpath(self):
14-
self.assertEqual(["self::x:*[@id = 'id']"], css_to_xpath(['#id']))
23+
self.assertEqual([], css_to_xpath([]))
24+
self.assertEqual([], css_to_xpath(['div>>p'])) # omit invalid selector
25+
self.assertEqual(["self::x:*[@id = 'test']"], css_to_xpath(['#test']))
26+
self.assertEqual(
27+
[
28+
"self::x:div[@id = 'test']",
29+
"(self::x:span or self::*[local-name()=\"span\"])"
30+
],
31+
css_to_xpath(["div>>p", 'div#test', 'span']))
32+
33+
def test_create_xpath(self):
34+
pattern = create_xpath(('p', 'math',))
35+
self.assertEqual(
36+
pattern,
37+
'.//*[(self::x:p or self::*[local-name()="p"]) or '
38+
'(self::x:math or self::*[local-name()="math"])]')
39+
40+
# A sample code to test XPath pattern matching with MathML elements
41+
# that have independent namespaces.
42+
xhtml = etree.XML("""<?xml version="1.0" encoding="utf-8"?>
43+
<!DOCTYPE html>
44+
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
45+
<head><title>Document</title></head>
46+
<body>
47+
<p>Test MathML element</p>
48+
<math xmlns="http://www.w3.org/1998/Math/MathML">
49+
<munder>
50+
<mo>∑</mo>
51+
<mi>A</mi>
52+
</munder>
53+
<munder displaystyle="true">
54+
<mo>∑</mo>
55+
<mi>A</mi>
56+
</munder>
57+
<munder>
58+
<mo>∑</mo>
59+
<mi scriptlevel="0">A</mi>
60+
</munder>
61+
</math>
62+
</body>
63+
</html>""".encode())
64+
body = xhtml.find('./x:body', namespaces=ns)
65+
elements = body.xpath(pattern, namespaces=ns)
66+
67+
self.assertEqual(2, len(elements))
68+
self.assertEqual('p', etree.QName(elements[0]).localname)
69+
self.assertEqual('math', etree.QName(elements[1]).localname)
1570

1671
def test_uid(self):
1772
self.assertEqual('202cb962ac59075b964b07152d234b70', uid('123'))

0 commit comments

Comments
 (0)