Commit 670c03d

Optimize parsing
A few changes:

1. Switch from `multiprocessing` to `concurrent.futures` with `ProcessPoolExecutor`. This is just to make the code easier to work with.
2. Perform `check_file()` as results return to the main process, to reduce peak memory usage. Previously all parsed files were kept in memory before being checked, which led to massive memory usage.
3. Construct parsers once per process, rather than once per file. Previously parser construction took ~5% of the runtime; this reduces it to a constant amount.

Benchmarked on a project with 238 templates.

Before:

```
$ time curlylint templates/**/*.html
All done! ✨ 🍰 ✨
curlylint templates/**/*.html 352.25s user 3.37s system 999% cpu 35.575 total
```

After:

```
$ time curlylint templates/**/*.html
All done! ✨ 🍰 ✨
curlylint templates/**/*.html 324.22s user 2.79s system 995% cpu 32.858 total
```

~8% of the time saved. The parser remains quite slow; I think it does an unfortunate amount of backtracking.
1 parent b64ec22 commit 670c03d
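For illustration, here is a minimal, self-contained sketch of the pattern the commit relies on: build an expensive object once per worker process via `ProcessPoolExecutor`'s `initializer`, then consume results with `as_completed` so each one can be handled as it arrives. This is not curlylint's code; `make_expensive_parser`, `_init_worker`, `parse_one`, and `run` are hypothetical names used only for the sketch.

```python
# Minimal sketch of the per-process initializer + as_completed pattern
# (illustrative only; these names are hypothetical, not curlylint code).
from concurrent.futures import ProcessPoolExecutor, as_completed

PARSER = None  # built once per worker process by the pool initializer


def make_expensive_parser(config):
    """Stand-in for a costly constructor such as curlylint's make_parser()."""
    return {"uppercase": config.get("uppercase", False)}


def _init_worker(config):
    """Runs once in each worker process when the pool starts it."""
    global PARSER
    PARSER = make_expensive_parser(config)


def parse_one(text):
    """Worker task: reuses the per-process PARSER instead of rebuilding it."""
    return text.upper() if PARSER["uppercase"] else text


def run(texts, config):
    results = []
    with ProcessPoolExecutor(
        initializer=_init_worker, initargs=(config,)
    ) as executor:
        futures = [executor.submit(parse_one, t) for t in texts]
        # as_completed yields each future as soon as it finishes; the commit
        # uses this to check each parsed file immediately rather than keeping
        # every parsed result in memory before checking, as pool.map() did.
        for future in as_completed(futures):
            results.append(future.result())
    return results


if __name__ == "__main__":
    print(run(["a", "b"], {"uppercase": True}))
```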

2 files changed: +33 -28 lines
CHANGELOG.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -4,6 +4,10 @@
 
 ## Unreleased
 
+### Changed
+
+- Optimized parsing a bit.
+
 ## [v0.13.0](https://github.com/thibaudcolas/curlylint/releases/tag/v0.13.0) 2021-04-25
 
 This release comes with a blog post! Read on [Quality-of-life improvements](https://www.curlylint.org/blog/quality-of-life-improvements).
```

curlylint/lint.py

Lines changed: 29 additions & 28 deletions
```diff
@@ -1,41 +1,48 @@
 import sys
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
 from typing import Set
 
 import parsy
 
-from .check import check_file, check_files
+from .check import check_file
 from .file import File
 from .issue import Issue, IssueLocation
 from .parse import make_parser
 
+PARSER = None
+
+
+def _make_parser(config):
+    """
+    Process initializer
+    """
+    global PARSER
+    PARSER = make_parser(config)
+
 
 def get_parsy_error_location(error, file_path):
     line, column = parsy.line_info_at(error.stream, error.index)
     return IssueLocation(line=line, column=column, file_path=file_path)
 
 
-def parse_file(path_and_config):
+def parse_file(path):
     """
     Returns a tuple ([Issue], File | None).
     """
-    path, config = path_and_config
-
     with path.open("r") as f:
         source = f.read()
 
-    return parse_source(path, config, source)
-
+    return parse_source(path, source)
 
 
-def parse_source(path: Path, config, source: str):
-    parser = make_parser(config)
 
+def parse_source(path: Path, source: str):
     try:
         file = File(
             path=path,
             source=source,
             lines=source.split("\n"),
-            tree=parser["content"].parse(source),
+            tree=PARSER["content"].parse(source),
         )
         return [], file
     except parsy.ParseError as error:
@@ -46,36 +53,30 @@ def parse_source(path: Path, config, source: str):
 
 def lint(paths: Set[Path], config):
     issues = []
-    files = []
-
-    from multiprocessing import Pool
-
-    pool = Pool()
-
-    parse_file_args = ((p, config) for p in paths)
-    results = pool.map(parse_file, parse_file_args)
-    for result in results:
-        parse_issues, file = result
-        issues += parse_issues
-        if file is not None:
-            files.append(file)
-
-    if config.get("parse_only", False):
-        return issues
 
+    parse_only = config.get("parse_only", False)
     rules = config.get("rules")
 
-    if rules:
-        issues += check_files(files, rules)
+    with ProcessPoolExecutor(
+        initializer=_make_parser, initargs=(config,)
+    ) as executor:
+        futures = [executor.submit(parse_file, path) for path in paths]
+        for future in as_completed(futures):
+            parse_issues, file = future.result()
+            issues.extend(parse_issues)
+            if file is not None and not parse_only:
+                issues.extend(check_file(file, rules))
 
     return issues
 
 
 def lint_one(path: Path, config):
+    _make_parser(config)
+
     if not path.is_file() and str(path) == "-":
         source = sys.stdin.read()
         parse_issues, file = parse_source(
-            config.get("stdin_filepath", path), config, source
+            config.get("stdin_filepath", path), source
        )
    else:
        parse_issues, file = parse_file((path, config))
```
