Skip to content

Commit 90bda8d

Browse files
authored
Merge pull request #314 from dclong/dev
Merge dev into main
2 parents 29bd503 + 908702e commit 90bda8d

File tree

14 files changed

+1564
-1461
lines changed

14 files changed

+1564
-1461
lines changed

dsutil/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
from . import git
44
from . import poetry
55

6-
__version__ = "0.68.0"
6+
__version__ = "0.68.1"

dsutil/cv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def video_to_image(
4242

4343
def resize_image(
4444
paths: Union[str, Path, Iterable[Path]], desdir: Union[str, Path, None],
45-
size: tuple[int]
45+
size: tuple[int, int]
4646
) -> None:
4747
"""Resize images to a given size.
4848

dsutil/docker/builder.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Docker related utils.
22
"""
33
from __future__ import annotations
4-
from typing import Union
4+
from typing import Optional, Union
55
from dataclasses import dataclass
66
import tempfile
77
from pathlib import Path
@@ -124,7 +124,7 @@ def __init__(
124124
git_url: str,
125125
branch: str = "dev",
126126
branch_fallback: str = "dev",
127-
repo_path: dict[str, str] = None
127+
repo_path: Optional[dict[str, str]] = None
128128
):
129129
"""Initialize a DockerImage object.
130130
@@ -299,7 +299,7 @@ def build(
299299
action="build",
300300
seconds=(time.perf_counter_ns() - time_begin) / 1E9,
301301
)
302-
except docker.errors.ImageNotFound as err:
302+
except docker.errors.ImageNotFound:
303303
return DockerActionResult(
304304
succeed=False,
305305
err_msg="",
@@ -392,7 +392,7 @@ def __init__(
392392
builder: str = _get_docker_builder(),
393393
):
394394
if isinstance(branch_urls, (str, Path)):
395-
with open(branch_urls, "r") as fin:
395+
with open(branch_urls, "r", encoding="utf-8") as fin:
396396
branch_urls = yaml.load(fin, Loader=yaml.FullLoader)
397397
self._branch_urls = branch_urls
398398
self._branch_fallback = branch_fallback
@@ -514,7 +514,7 @@ def build_graph(self):
514514
def save_graph(self, output="graph.yaml") -> None:
515515
"""Save the underlying graph structure to files.
516516
"""
517-
with open(output, "w") as fout:
517+
with open(output, "w", encoding="utf-8") as fout:
518518
# nodes and attributes
519519
fout.write("nodes:\n")
520520
for node in self._graph.nodes:
@@ -536,7 +536,7 @@ def _login_servers(self) -> None:
536536

537537
def build_images(
538538
self,
539-
tag_build: str = None,
539+
tag_build: Optional[str] = None,
540540
copy_ssh_to: str = "",
541541
push: bool = True,
542542
remove: bool = False,

dsutil/filesystem.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"""Filesystem related util functions.
33
"""
44
from __future__ import annotations
5-
from typing import Union, Iterable, Callable
5+
from typing import Optional, Union, Iterable, Callable
66
import os
77
import sys
88
import re
@@ -20,7 +20,7 @@
2020
HOME = Path.home()
2121

2222

23-
def copy_if_exists(src: str, dst: str = HOME) -> bool:
23+
def copy_if_exists(src: str, dst: Union[str, Path] = HOME) -> bool:
2424
"""Copy a file.
2525
No exception is thrown if the source file does not exist.
2626
@@ -37,7 +37,9 @@ def copy_if_exists(src: str, dst: str = HOME) -> bool:
3737
return False
3838

3939

40-
def link_if_exists(src: str, dst: str = HOME, target_is_directory: bool = True) -> bool:
40+
def link_if_exists(
41+
src: str, dst: Union[str, Path] = HOME, target_is_directory: bool = True
42+
) -> bool:
4143
"""Make a symbolic link of a file.
4244
No exception is thrown if the source file does not exist.
4345
@@ -217,7 +219,7 @@ def find_data_tables(
217219
def _find_data_tables_file(file, filter_, patterns) -> set[str]:
218220
if isinstance(file, str):
219221
file = Path(file)
220-
text = file.read_text().lower()
222+
text = file.read_text(encoding="utf-8").lower()
221223
patterns = {
222224
r"from\s+(\w+)\W*\s*",
223225
r"from\s+(\w+\.\w+)\W*\s*",
@@ -313,7 +315,7 @@ def find_ess_empty(path: Union[str, Path], ignore: Callable = _ignore) -> list[P
313315

314316

315317
def _find_ess_empty(
316-
path: Path, ignore: Callable, ess_empty: dict[Path, bool], ess_empty_dir: list[str]
318+
path: Path, ignore: Callable, ess_empty: dict[Path, bool], ess_empty_dir: list[Path]
317319
):
318320
if is_ess_empty(path=path, ignore=ignore, ess_empty=ess_empty):
319321
ess_empty_dir.append(path)
@@ -326,7 +328,9 @@ def _find_ess_empty(
326328

327329

328330
def is_ess_empty(
329-
path: Path, ignore: Callable = _ignore, ess_empty: dict[Path, bool] = None
331+
path: Path,
332+
ignore: Callable = _ignore,
333+
ess_empty: Optional[dict[Path, bool]] = None
330334
):
331335
"""Check if a directory is essentially empty.
332336
@@ -365,9 +369,9 @@ def is_ess_empty(
365369

366370
def update_file(
367371
path: Path,
368-
regex: list[tuple[str, str]] = None,
369-
exact: list[tuple[str, str]] = None,
370-
append: Union[str, Iterable[str]] = None,
372+
regex: Optional[list[tuple[str, str]]] = None,
373+
exact: Optional[list[tuple[str, str]]] = None,
374+
append: Union[None, str, Iterable[str]] = None,
371375
exist_skip: bool = True,
372376
) -> None:
373377
"""Update a text file using regular expression substitution.
@@ -382,7 +386,7 @@ def update_file(
382386
"""
383387
if isinstance(path, str):
384388
path = Path(path)
385-
text = path.read_text()
389+
text = path.read_text(encoding="utf-8")
386390
if regex:
387391
for pattern, replace in regex:
388392
text = re.sub(pattern, replace, text)
@@ -394,7 +398,7 @@ def update_file(
394398
append = "\n".join(append)
395399
if not exist_skip or append not in text:
396400
text += append
397-
path.write_text(text)
401+
path.write_text(text, encoding="utf-8")
398402

399403

400404
def get_files(dir_: Union[str, Path], exts: Union[str, list[str]]) -> Iterable[Path]:
@@ -581,7 +585,9 @@ def prune_json(input: Union[str, Path], output: Union[str, Path] = ""):
581585
else:
582586
output = input.with_name(input.stem + "_prune.json")
583587
skip = False
584-
with input.open("r") as fin, output.open("w") as fout:
588+
with input.open("r", encoding="utf-8") as fin, output.open(
589+
"w", encoding="utf-8"
590+
) as fout:
585591
for line in fin:
586592
line = line.strip()
587593
if line == '"value_counts": {':
@@ -601,7 +607,7 @@ def _filter_num(path: Union[str, Path], pattern: str, num_lines: int):
601607
results = []
602608
res = []
603609
count = 0
604-
for line in path.open():
610+
for line in path.open(encoding="utf-8"):
605611
if count > 0:
606612
res.append(line)
607613
count -= 1
@@ -623,7 +629,7 @@ def _filter_sp(path: Union[str, Path], pattern: str, sub_pattern: str):
623629
results = []
624630
res = []
625631
sub = False
626-
for line in path.open():
632+
for line in path.open(encoding="utf-8"):
627633
if sub:
628634
if re.search(sub_pattern, line):
629635
res.append(line)

dsutil/hadoop/kerberos.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def save_passwd(passwd: str) -> None:
4545
"""
4646
bytes_ = passwd.encode("ascii")
4747
encode = base64.b64encode(bytes_).decode()
48-
with open(PROFILE, "w") as fout:
48+
with open(PROFILE, "w", encoding="utf-8") as fout:
4949
fout.write(encode)
5050
os.chmod(PROFILE, 0o600)
5151

@@ -58,7 +58,7 @@ def read_passwd() -> str:
5858
os.chmod(PROFILE, 0o600)
5959
if not os.path.isfile(PROFILE):
6060
return ""
61-
with open(PROFILE, "r") as fin:
61+
with open(PROFILE, "r", encoding="utf-8") as fin:
6262
return base64.b64decode(fin.read()).decode()
6363

6464

@@ -170,7 +170,7 @@ def _read_config(config: Union[Path, str]) -> Dict[str, Any]:
170170
config = Path(config)
171171
if not config.is_file():
172172
return {"email": {}}
173-
with config.open("r") as fin:
173+
with config.open("r", encoding="utf-8") as fin:
174174
return yaml.load(fin, Loader=yaml.FullLoader)
175175

176176

dsutil/hadoop/log.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def __init__(
8080
log_file: Union[str, Path],
8181
context_size: int = 5,
8282
keywords: Sequence[str] = KEYWORDS,
83-
patterns: Sequence[str] = PATTERNS,
83+
patterns: Sequence[tuple[str, str]] = PATTERNS,
8484
output: Union[str, Path] = "",
8585
threshold: float = 0.7,
8686
dump_by_keyword: bool = False,
@@ -89,11 +89,11 @@ def __init__(
8989
if isinstance(log_file, Path) else Path(log_file)).resolve()
9090
self._context_size: int = context_size
9191
self._keywords: Sequence[str] = keywords
92-
self._patterns: Sequence[str] = patterns
92+
self._patterns: Sequence[tuple[str, str]] = patterns
9393
self._num_rows: int = 0
9494
self._lookup: dict[str, dict[str, int]] = {kwd: {} for kwd in self._keywords}
9595
self._queue: deque = deque()
96-
self._output: str = self._get_output(output)
96+
self._output: Path = self._get_output(output)
9797
self._threshold: float = threshold
9898
self._dump_by_keyword: bool = dump_by_keyword
9999

@@ -153,15 +153,15 @@ def _count_rows(self):
153153
if self._num_rows:
154154
return
155155
logger.info("Counting total number of rows ...")
156-
with open(self._log_file, "r") as fin:
156+
with open(self._log_file, "r", encoding="utf-8") as fin:
157157
self._num_rows = sum(1 for line in fin)
158158
logger.info("Total number of rows: {:,}", self._num_rows)
159159

160160
def _scan_error_lines(self) -> None:
161161
print()
162162
logger.info("Scanning for error lines in the log ...")
163163
lines = [DASH_50 + " Possible Error Lines " + DASH_50 + "\n"]
164-
with open(self._log_file, "r") as fin:
164+
with open(self._log_file, "r", encoding="utf-8") as fin:
165165
dump_flag = -1
166166
for idx, line in tqdm(enumerate(fin), total=self._num_rows):
167167
self._queue.append(f"L{idx}: {line}")
@@ -179,7 +179,7 @@ def _scan_error_lines(self) -> None:
179179
dump_flag = -1
180180
if dump_flag >= 0:
181181
self._dump_queue(lines)
182-
with open(self._output, "w") as fout:
182+
with open(self._output, "w", encoding="utf-8") as fout:
183183
fout.writelines(lines)
184184
logger.info("Possible Error Lines have been dumped into {}", self._output)
185185

@@ -210,7 +210,7 @@ def _dedup_log(self):
210210
lines_unique.extend(self._dedup_log_1(kwd, lines, dir_))
211211
lines_unique = [self._error_priority(line) for line in lines_unique]
212212
lines_unique.sort()
213-
with self._output.open("a") as fout:
213+
with self._output.open("a", encoding="utf-8") as fout:
214214
self._write_lines_unique(lines_unique, fout)
215215
self._write_lines_unique(lines_unique, sys.stdout)
216216

dsutil/hadoop/pyspark_submit.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -135,23 +135,23 @@ def submit(self, cmd: str, attachments: Union[None, list[str]] = None) -> bool:
135135
logger.info("Submitting Spark job ...\n{}", cmd)
136136
stdout = []
137137
self._spark_submit_log.clear()
138-
process = sp.Popen(cmd, shell=True, stderr=sp.PIPE)
139-
while True:
140-
if process.poll() is None:
141-
line = process.stderr.readline().decode().rstrip() # pytype: disable=attribute-error
142-
line = self._filter(line, time_begin, self._spark_log_filter)
143-
if line:
144-
print(line)
145-
stdout.append(line)
146-
else:
147-
for line in process.stderr.readlines(): # pytype: disable=attribute-error
148-
line = self._filter(
149-
line.decode().rstrip(), time_begin, self._spark_log_filter
150-
)
138+
with sp.Popen(cmd, shell=True, stderr=sp.PIPE) as process:
139+
while True:
140+
if process.poll() is None:
141+
line = process.stderr.readline().decode().rstrip() # pytype: disable=attribute-error
142+
line = self._filter(line, time_begin, self._spark_log_filter)
151143
if line:
152144
print(line)
153145
stdout.append(line)
154-
break
146+
else:
147+
for line in process.stderr.readlines(): # pytype: disable=attribute-error
148+
line = self._filter(
149+
line.decode().rstrip(), time_begin, self._spark_log_filter
150+
)
151+
if line:
152+
print(line)
153+
stdout.append(line)
154+
break
155155
# status
156156
status = self._final_status(stdout)
157157
app_id = self._app_id(stdout)
@@ -376,7 +376,7 @@ def submit(args: Namespace) -> None:
376376
if not args.config:
377377
config = {}
378378
else:
379-
with open(args.config, "r") as fin:
379+
with open(args.config, "r", encoding="utf-8") as fin:
380380
config = yaml.load(fin, Loader=yaml.FullLoader)
381381
# handle various options
382382
if args.spark_submit_local:

dsutil/hash.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ def rmd5(path: Union[str, Path], output: Union[str, Path] = "") -> str:
2323
if output:
2424
if isinstance(output, str):
2525
output = Path(output)
26-
with output.open("w") as fout:
26+
with output.open("w", encoding="utf-8") as fout:
2727
fout.write(text)
2828
return hashlib.md5(text.encode()).hexdigest()
2929

3030

31-
def _rmd5(path: Path, res: list[tuple[str]]) -> None:
31+
def _rmd5(path: Path, res: list[str]) -> None:
3232
"""Helper function of rmd5.
3333
3434
:param path: The Path object of a file or directory.

dsutil/poetry.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""This module makes it easy to work with poetry to managing your Python project.
22
"""
33
from __future__ import annotations
4-
from typing import Union, Iterable
4+
from typing import Optional, Union, Iterable
55
from pathlib import Path
66
import sys
77
import os
@@ -101,7 +101,7 @@ def _update_version(ver: str, proj_dir: Path) -> None:
101101
def version(
102102
ver: str = "",
103103
commit: bool = False,
104-
proj_dir: Path = None,
104+
proj_dir: Optional[Path] = None,
105105
) -> None:
106106
"""List or update the version of the package.
107107
@@ -166,7 +166,7 @@ def add_tag_release(
166166
def format_code(
167167
inplace: bool = False,
168168
commit: bool = False,
169-
proj_dir: Path = None,
169+
proj_dir: Optional[Path] = None,
170170
files: Iterable[Union[Path, str]] = ()
171171
) -> None:
172172
"""Format code.
@@ -304,7 +304,9 @@ def build_package(
304304
sp.run(f"cd '{proj_dir}' && poetry build", shell=True, check=True)
305305

306306

307-
def clean(proj_dir: Path = None, ignore: Union[str, Path, None] = None) -> None:
307+
def clean(
308+
proj_dir: Optional[Path] = None, ignore: Union[str, Path, None] = None
309+
) -> None:
308310
"""Remove non-essential files from the current project.
309311
310312
:param proj_dir: The root directory of the Poetry project.
@@ -319,7 +321,7 @@ def clean(proj_dir: Path = None, ignore: Union[str, Path, None] = None) -> None:
319321
if not ignore.is_file():
320322
return
321323
logger.info("Use the GitIgnore file: {}", ignore)
322-
with ignore.open("r") as fin:
324+
with ignore.open("r", encoding="utf-8") as fin:
323325
patterns = [line.strip() for line in fin]
324326
spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, patterns)
325327
_clean(proj_dir, spec)

0 commit comments

Comments
 (0)