Skip to content

Commit 619c09e

Browse files
committed
Handle files not encoded with UTF-8
Black can format files encoded with an encoding other than UTF-8, based on the encoding cookie present in the first two lines. Darker is also supposed to format those files, but an exception is raised instead. The exception happens because all the output from git is decoded with UTF-8, including calls to `git show <revision>:<path>` that returns the binary content of a file from the history. If a file used an encoding other than UTF-8, the decoding fails. `TextDocument.from_bytes()` can now create a `TextDocument` from the raw binary content of a file, automatically detecting the encoding used. From now on, this new factory is used when loading a file from disk or from a revision in the git history.
1 parent b7ed756 commit 619c09e

File tree

6 files changed

+180
-59
lines changed

6 files changed

+180
-59
lines changed

CHANGES.rst

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Added
1212
Fixed
1313
-----
1414
- Avoid memory leak from using ``@lru_cache`` on a method.
15+
- Handle files encoded with an encoding other than UTF-8 without an exception.
1516

1617

1718
1.4.2_ - 2022-03-12

src/darker/git.py

+35-5
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from functools import lru_cache
1010
from pathlib import Path
1111
from subprocess import DEVNULL, PIPE, CalledProcessError, check_output, run # nosec
12-
from typing import Dict, Iterable, List, Set, Tuple
12+
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union, overload
1313

1414
from darker.diff import diff_and_get_opcodes, opcodes_to_edit_linenums
1515
from darker.utils import GIT_DATEFORMAT, TextDocument
@@ -79,8 +79,8 @@ def git_get_content_at_revision(path: Path, revision: str, cwd: Path) -> TextDoc
7979
return TextDocument.from_file(abspath)
8080
cmd = ["show", f"{revision}:./{path.as_posix()}"]
8181
try:
82-
return TextDocument.from_lines(
83-
_git_check_output_lines(cmd, cwd, exit_on_error=False),
82+
return TextDocument.from_bytes(
83+
_git_check_output(cmd, cwd, exit_on_error=False),
8484
mtime=git_get_mtime_at_commit(path, revision, cwd),
8585
)
8686
except CalledProcessError as exc_info:
@@ -207,15 +207,45 @@ def _git_check_output_lines(
207207
cmd: List[str], cwd: Path, exit_on_error: bool = True
208208
) -> List[str]:
209209
"""Log command line, run Git, split stdout to lines, exit with 123 on error"""
210+
return _git_check_output(
211+
cmd,
212+
cwd,
213+
exit_on_error=exit_on_error,
214+
encoding="utf-8",
215+
).splitlines()
216+
217+
218+
@overload
219+
def _git_check_output(
220+
cmd: List[str], cwd: Path, *, exit_on_error: bool = ..., encoding: None = ...
221+
) -> bytes:
222+
...
223+
224+
225+
@overload
226+
def _git_check_output(
227+
cmd: List[str], cwd: Path, *, exit_on_error: bool = ..., encoding: str
228+
) -> str:
229+
...
230+
231+
232+
def _git_check_output(
233+
cmd: List[str],
234+
cwd: Path,
235+
*,
236+
exit_on_error: bool = True,
237+
encoding: Optional[str] = None,
238+
) -> Union[str, bytes]:
239+
"""Log command line, run Git, return stdout, exit with 123 on error"""
210240
logger.debug("[%s]$ git %s", cwd, " ".join(cmd))
211241
try:
212242
return check_output( # nosec
213243
["git"] + cmd,
214244
cwd=str(cwd),
215-
encoding="utf-8",
245+
encoding=encoding,
216246
stderr=PIPE,
217247
env=_make_git_env(),
218-
).splitlines()
248+
)
219249
except CalledProcessError as exc_info:
220250
if not exit_on_error:
221251
raise

src/darker/tests/conftest.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
from pathlib import Path
55
from subprocess import check_call # nosec
6-
from typing import Dict, Optional
6+
from typing import Dict, Union
77

88
import pytest
99
from black import find_project_root as black_find_project_root
@@ -39,7 +39,7 @@ def _run_and_get_first_line(self, *args: str) -> str:
3939
return _git_check_output_lines(list(args), Path(self.root))[0]
4040

4141
def add(
42-
self, paths_and_contents: Dict[str, Optional[str]], commit: str = None
42+
self, paths_and_contents: Dict[str, Union[str, bytes, None]], commit: str = None
4343
) -> Dict[str, Path]:
4444
"""Add/remove/modify files and optionally commit the changes
4545
@@ -58,10 +58,12 @@ def add(
5858
path = absolute_paths[relative_path]
5959
if content is None:
6060
self._run("rm", "--", relative_path)
61-
else:
62-
path.parent.mkdir(parents=True, exist_ok=True)
63-
path.write_bytes(content.encode("utf-8"))
64-
self._run("add", "--", relative_path)
61+
continue
62+
if isinstance(content, str):
63+
content = content.encode("utf-8")
64+
path.parent.mkdir(parents=True, exist_ok=True)
65+
path.write_bytes(content)
66+
self._run("add", "--", relative_path)
6567
if commit:
6668
self._run("commit", "-m", commit)
6769
return absolute_paths

src/darker/tests/test_git.py

+73-20
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,17 @@ def test_revisionrange_parse(revision_range, expect):
139139
assert result == expect
140140

141141

142+
def git_call(cmd, encoding=None):
143+
"""Returns a mocked call to git"""
144+
return call(
145+
cmd.split(),
146+
cwd=str(Path("/path")),
147+
encoding=encoding,
148+
stderr=PIPE,
149+
env={"LC_ALL": "C", "PATH": os.environ["PATH"]},
150+
)
151+
152+
142153
@pytest.mark.kwparametrize(
143154
dict(
144155
revision=":WORKTREE:",
@@ -147,31 +158,31 @@ def test_revisionrange_parse(revision_range, expect):
147158
dict(
148159
revision="HEAD",
149160
expect_git_calls=[
150-
"git show HEAD:./my.txt",
151-
"git log -1 --format=%ct HEAD -- my.txt",
161+
git_call("git show HEAD:./my.txt"),
162+
git_call("git log -1 --format=%ct HEAD -- my.txt", encoding="utf-8"),
152163
],
153164
expect_textdocument_calls=[
154-
call.from_lines([b"1627107028"], mtime="2021-07-24 06:10:28.000000 +0000")
165+
call.from_bytes(b"1627107028", mtime="2021-07-24 06:10:28.000000 +0000")
155166
],
156167
),
157168
dict(
158169
revision="HEAD^",
159170
expect_git_calls=[
160-
"git show HEAD^:./my.txt",
161-
"git log -1 --format=%ct HEAD^ -- my.txt",
171+
git_call("git show HEAD^:./my.txt"),
172+
git_call("git log -1 --format=%ct HEAD^ -- my.txt", encoding="utf-8"),
162173
],
163174
expect_textdocument_calls=[
164-
call.from_lines([b"1627107028"], mtime="2021-07-24 06:10:28.000000 +0000")
175+
call.from_bytes(b"1627107028", mtime="2021-07-24 06:10:28.000000 +0000")
165176
],
166177
),
167178
dict(
168179
revision="master",
169180
expect_git_calls=[
170-
"git show master:./my.txt",
171-
"git log -1 --format=%ct master -- my.txt",
181+
git_call("git show master:./my.txt"),
182+
git_call("git log -1 --format=%ct master -- my.txt", encoding="utf-8"),
172183
],
173184
expect_textdocument_calls=[
174-
call.from_lines([b"1627107028"], mtime="2021-07-24 06:10:28.000000 +0000")
185+
call.from_bytes(b"1627107028", mtime="2021-07-24 06:10:28.000000 +0000")
175186
],
176187
),
177188
expect_git_calls=[],
@@ -189,17 +200,7 @@ def test_git_get_content_at_revision_obtain_file_content(
189200

190201
git.git_get_content_at_revision(Path("my.txt"), revision, Path("/path"))
191202

192-
expected_calls = [
193-
call(
194-
expected_call.split(),
195-
cwd=str(Path("/path")),
196-
encoding="utf-8",
197-
stderr=PIPE,
198-
env={"LC_ALL": "C", "PATH": os.environ["PATH"]},
199-
)
200-
for expected_call in expect_git_calls
201-
]
202-
assert check_output.call_args_list == expected_calls
203+
assert check_output.call_args_list == expect_git_calls
203204
assert text_document_class.method_calls == expect_textdocument_calls
204205

205206

@@ -403,6 +404,58 @@ def test_git_get_content_at_revision_stderr(git_repo, capfd, caplog):
403404
assert caplog.text == ""
404405

405406

407+
@pytest.fixture(scope="module")
408+
def encodings_repo(tmp_path_factory):
409+
"""Create an example Git repository using various encodings for the same file"""
410+
tmpdir = tmp_path_factory.mktemp("branched_repo")
411+
git_repo = GitRepoFixture.create_repository(tmpdir)
412+
# Commit without an encoding cookie, defaults to utf-8
413+
git_repo.add({"file.py": "darker = 'plus foncé'\n"}, commit="Default encoding")
414+
git_repo.create_tag("default")
415+
# Commit without an encoding cookie but with a utf-8 signature
416+
content = "darker = 'plus foncé'\n".encode("utf-8-sig")
417+
git_repo.add({"file.py": content}, commit="utf-8-sig")
418+
git_repo.create_tag("utf-8-sig")
419+
# Commit with an iso-8859-1 encoding cookie
420+
content = "# coding: iso-8859-1\ndarker = 'plus foncé'\n".encode("iso-8859-1")
421+
git_repo.add({"file.py": content}, commit="iso-8859-1")
422+
git_repo.create_tag("iso-8859-1")
423+
# Commit with a utf-8 encoding cookie
424+
content = "# coding: utf-8\npython = 'パイソン'\n".encode("utf-8")
425+
git_repo.add({"file.py": content}, commit="utf-8")
426+
git_repo.create_tag("utf-8")
427+
# Current worktree content (not committed) with a shitfjs encoding cookie
428+
content = "# coding: shiftjis\npython = 'パイソン'\n".encode("shiftjis")
429+
git_repo.add({"file.py": content})
430+
return git_repo
431+
432+
433+
@pytest.mark.kwparametrize(
434+
dict(commit="default", encoding="utf-8", lines=("darker = 'plus foncé'",)),
435+
dict(commit="utf-8-sig", encoding="utf-8-sig", lines=("darker = 'plus foncé'",)),
436+
dict(
437+
commit="iso-8859-1",
438+
encoding="iso-8859-1",
439+
lines=("# coding: iso-8859-1", "darker = 'plus foncé'"),
440+
),
441+
dict(
442+
commit="utf-8", encoding="utf-8", lines=("# coding: utf-8", "python = 'パイソン'")
443+
),
444+
dict(
445+
commit=":WORKTREE:",
446+
encoding="shiftjis",
447+
lines=("# coding: shiftjis", "python = 'パイソン'"),
448+
),
449+
)
450+
def test_git_get_content_at_revision_encoding(encodings_repo, commit, encoding, lines):
451+
"""Git file is loaded using its historical encoding"""
452+
result = git.git_get_content_at_revision(
453+
Path("file.py"), commit, encodings_repo.root
454+
)
455+
assert result.encoding == encoding
456+
assert result.lines == lines
457+
458+
406459
@pytest.mark.kwparametrize(
407460
dict(retval=0, expect=True),
408461
dict(retval=1, expect=False),

src/darker/tests/test_utils.py

+46-19
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Unit tests for :mod:`darker.utils`"""
22

3+
# pylint: disable=redefined-outer-name, comparison-with-callable
4+
35
import logging
46
import os
57
from pathlib import Path
@@ -17,6 +19,39 @@
1719
)
1820

1921

22+
@pytest.fixture(params=[TextDocument.from_file, TextDocument.from_bytes])
23+
def textdocument_factory(request):
24+
"""Fixture for a factory function that creates a ``TextDocument``
25+
26+
The fixture can be parametrized with `(bytes) -> TextDocument` functions
27+
that take the raw bytes of the document.
28+
29+
By default, it is parametrized with the ``TextDocument.from_file()`` (for
30+
which it creates a temporary file) and the ``TextDocument.from_bytes()``
31+
classmethods.
32+
"""
33+
if request.param == TextDocument.from_file:
34+
35+
def factory(content):
36+
tmp_path = request.getfixturevalue("tmp_path")
37+
path = tmp_path / "test.py"
38+
path.write_bytes(content)
39+
return TextDocument.from_file(path)
40+
41+
return factory
42+
43+
return request.param
44+
45+
46+
@pytest.fixture
47+
def textdocument(request, textdocument_factory):
48+
"""Fixture for a ``TextDocument``
49+
50+
The fixture must be parametrized with the raw bytes of the document.
51+
"""
52+
return textdocument_factory(request.param)
53+
54+
2055
@pytest.mark.kwparametrize(
2156
dict(string="", expect="\n"),
2257
dict(string="\n", expect="\n"),
@@ -225,31 +260,23 @@ def test_textdocument_from_str(
225260

226261

227262
@pytest.mark.kwparametrize(
228-
dict(content=b'print("touch\xc3\xa9")\n', expect="utf-8"),
229-
dict(content=b'\xef\xbb\xbfprint("touch\xc3\xa9")\n', expect="utf-8-sig"),
230-
dict(content=b'# coding: iso-8859-1\n"touch\xe9"\n', expect="iso-8859-1"),
263+
dict(textdocument=b'print("touch\xc3\xa9")\n', expect="utf-8"),
264+
dict(textdocument=b'\xef\xbb\xbfprint("touch\xc3\xa9")\n', expect="utf-8-sig"),
265+
dict(textdocument=b'# coding: iso-8859-1\n"touch\xe9"\n', expect="iso-8859-1"),
266+
indirect=["textdocument"],
231267
)
232-
def test_textdocument_from_file_detect_encoding(tmp_path, content, expect):
233-
"""TextDocument.from_file() detects the file encoding correctly"""
234-
path = tmp_path / "test.py"
235-
path.write_bytes(content)
236-
237-
textdocument = TextDocument.from_file(path)
238-
268+
def test_textdocument_detect_encoding(textdocument, expect):
269+
"""TextDocument.from_file/bytes() detects the file encoding correctly"""
239270
assert textdocument.encoding == expect
240271

241272

242273
@pytest.mark.kwparametrize(
243-
dict(content=b'print("unix")\n', expect="\n"),
244-
dict(content=b'print("windows")\r\n', expect="\r\n"),
274+
dict(textdocument=b'print("unix")\n', expect="\n"),
275+
dict(textdocument=b'print("windows")\r\n', expect="\r\n"),
276+
indirect=["textdocument"],
245277
)
246-
def test_textdocument_from_file_detect_newline(tmp_path, content, expect):
247-
"""TextDocument.from_file() detects the newline character sequence correctly"""
248-
path = tmp_path / "test.py"
249-
path.write_bytes(content)
250-
251-
textdocument = TextDocument.from_file(path)
252-
278+
def test_textdocument_detect_newline(textdocument, expect):
279+
"""TextDocument.from_file/bytes() detects the newline sequence correctly"""
253280
assert textdocument.newline == expect
254281

255282

src/darker/utils.py

+17-9
Original file line numberDiff line numberDiff line change
@@ -106,22 +106,30 @@ def from_str(
106106
newline = override_newline
107107
return cls(string, None, encoding=encoding, newline=newline, mtime=mtime)
108108

109+
@classmethod
110+
def from_bytes(cls, data: bytes, mtime: str = "") -> "TextDocument":
111+
"""Create a document object from a binary string
112+
113+
:param data: The binary content of the new text document
114+
:param mtime: The modification time of the original file
115+
116+
"""
117+
srcbuf = io.BytesIO(data)
118+
encoding, lines = tokenize.detect_encoding(srcbuf.readline)
119+
if not lines:
120+
return cls(lines=[], encoding=encoding, mtime=mtime)
121+
return cls.from_str(data.decode(encoding), encoding=encoding, mtime=mtime)
122+
109123
@classmethod
110124
def from_file(cls, path: Path) -> "TextDocument":
111-
"""Create a document object by reading an UTF-8 encoded text file
125+
"""Create a document object by reading a text file
112126
113127
Also store the last modification time of the file.
114128
115129
"""
116130
mtime = datetime.utcfromtimestamp(path.stat().st_mtime).strftime(GIT_DATEFORMAT)
117-
srcbuf = path.open("rb")
118-
encoding, lines = tokenize.detect_encoding(srcbuf.readline)
119-
if not lines:
120-
return cls(lines=[], encoding=encoding)
121-
srcbuf.seek(0)
122-
return cls.from_str(
123-
srcbuf.read().decode(encoding), encoding=encoding, mtime=mtime
124-
)
131+
with path.open("rb") as srcbuf:
132+
return cls.from_bytes(srcbuf.read(), mtime)
125133

126134
@classmethod
127135
def from_lines(

0 commit comments

Comments
 (0)