Skip to content

Commit 9ca4b25

Browse files
authored
Merge pull request #22 from mkdocstrings/dev-ignore-encoding-errors
Allow specifying the encoding of the VBA source code. Replace invalid sequences instead of failing.
2 parents bd426a4 + 8c7b360 commit 9ca4b25

File tree

3 files changed

+61
-5
lines changed

3 files changed

+61
-5
lines changed

Diff for: mkdocstrings_handlers/vba/_handler.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
MutableMapping,
1313
Dict,
1414
Mapping,
15-
Set,
1615
Tuple,
1716
)
1817

@@ -40,9 +39,17 @@ class VbaHandler(BaseHandler):
4039
The directory in which to look for VBA files.
4140
"""
4241

43-
def __init__(self, *, base_dir: Path, **kwargs: Any) -> None:
42+
encoding: str
43+
"""
44+
The encoding to use when reading VBA files.
45+
Excel exports .bas and .cls files as `latin1`.
46+
See https://en.wikipedia.org/wiki/ISO/IEC_8859-1 .
47+
"""
48+
49+
def __init__(self, *, base_dir: Path, encoding: str, **kwargs: Any) -> None:
4450
super().__init__(**kwargs)
4551
self.base_dir = base_dir
52+
self.encoding = encoding
4653

4754
name: str = "vba"
4855
"""
@@ -121,9 +128,7 @@ def collect(
121128
if not p.exists():
122129
raise CollectionError("File not found.")
123130

124-
with p.open("r") as f:
125-
code = f.read()
126-
131+
code = p.read_text(encoding=self.encoding, errors="replace")
127132
code = collapse_long_lines(code)
128133

129134
return VbaModuleInfo(
@@ -178,6 +183,7 @@ def get_handler(
178183
theme: str = "material",
179184
custom_templates: str | None = None,
180185
config_file_path: str | None = None,
186+
encoding: str = "latin1",
181187
**kwargs: Any,
182188
) -> VbaHandler:
183189
"""
@@ -187,6 +193,10 @@ def get_handler(
187193
theme: The theme to use when rendering contents.
188194
custom_templates: Directory containing custom templates.
189195
config_file_path: The MkDocs configuration file path.
196+
encoding:
197+
The encoding to use when reading VBA files.
198+
Excel exports .bas and .cls files as `latin1`.
199+
See https://en.wikipedia.org/wiki/ISO/IEC_8859-1 .
190200
kwargs: Extra keyword arguments that we don't use.
191201
192202
Returns:
@@ -198,6 +208,7 @@ def get_handler(
198208
if config_file_path
199209
else Path(".").resolve()
200210
),
211+
encoding=encoding,
201212
handler="vba",
202213
theme=theme,
203214
custom_templates=custom_templates,

Diff for: test/handler/__init__.py

Whitespace-only changes.

Diff for: test/handler/test_collect.py

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import unittest
2+
from pathlib import Path
3+
from tempfile import TemporaryDirectory
4+
5+
from mkdocstrings_handlers.vba import get_handler
6+
7+
# noinspection PyProtectedMember
8+
from mkdocstrings_handlers.vba._types import VbaModuleInfo
9+
10+
11+
def _test_collect(*, write_bytes: bytes, read_encoding: str) -> VbaModuleInfo:
12+
with TemporaryDirectory() as tmp_dir_str:
13+
tmp_dir = Path(tmp_dir_str)
14+
handler = get_handler(encoding=read_encoding)
15+
p = tmp_dir / "source.bas"
16+
p.write_bytes(write_bytes)
17+
return handler.collect(identifier=p.as_posix(), config={})
18+
19+
20+
class TestCollect(unittest.TestCase):
21+
22+
def test_undefined_unicode(self) -> None:
23+
# See https://symbl.cc/en/unicode-table/#undefined-0 for values that are undefined in Unicode.
24+
# \xe2\xbf\xaf is utf-8 for the undefined Unicode point U+2FEF
25+
module_info = _test_collect(
26+
write_bytes=b"Foo \xe2\xbf\xaf Bar",
27+
read_encoding="utf-8",
28+
)
29+
self.assertEqual(["Foo \u2fef Bar"], module_info.source)
30+
31+
def test_invalid_utf8(self) -> None:
32+
# invalid start byte
33+
module_info = _test_collect(
34+
write_bytes=b"\x89\x89\x89\x89",
35+
read_encoding="utf-8",
36+
)
37+
self.assertEqual(["����"], module_info.source)
38+
39+
def test_invalid_latin1(self) -> None:
40+
module_info = _test_collect(
41+
write_bytes="🎵".encode("utf-8"),
42+
read_encoding="latin1",
43+
)
44+
# Since `latin1` is a single-byte encoding, it can't detect invalid sequences, and so we get mojibake.
45+
self.assertEqual(["ð\x9f\x8eµ"], module_info.source)

0 commit comments

Comments
 (0)