Optimize parsing

adamchainz · adamchainz · commit 670c03d43420 · 2022-03-11T09:10:18.000Z
A few changes:

1. Switch from `multiprocessing` to `concurrent.futures` with `ProcessPoolExecutor`. This is just to make the code easier to work with.
2. Perform `check_file()` as results return to the main process, in order to reduce peak memory usage. Previously all parsed files were kept in memory before being checked, leading to massive memory usage.
3. Construct parsers once per process, rather than once per file. Previously parser construction took ~5% of the runtime, this reduces it to a constant amount.

Benchmarked on a project with 238 templates.

Before:

```
$ time curlylint templates/**/*.html
All done! ✨ 🍰 ✨

curlylint templates/**/*.html  352.25s user 3.37s system 999% cpu 35.575 total
```

After:

```
$ time curlylint templates/**/*.html
All done! ✨ 🍰 ✨

curlylint templates/**/*.html  324.22s user 2.79s system 995% cpu 32.858 total
```

~8% of the time saved.

The parser remains quite slow, I think it does an unfortunate amount of backtracking.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@
 
 ## Unreleased
 
+### Changed
+
+- Optimized parsing a bit.
+
 ## [v0.13.0](https://github.com/thibaudcolas/curlylint/releases/tag/v0.13.0) 2021-04-25
 
 This release comes with a blog post! Read on [Quality-of-life improvements](https://www.curlylint.org/blog/quality-of-life-improvements).
diff --git a/curlylint/lint.py b/curlylint/lint.py
@@ -1,41 +1,48 @@
 import sys
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
 from typing import Set
 
 import parsy
 
-from .check import check_file, check_files
+from .check import check_file
 from .file import File
 from .issue import Issue, IssueLocation
 from .parse import make_parser
 
+PARSER = None
+
+
+def _make_parser(config):
+    """
+    Process initializer
+    """
+    global PARSER
+    PARSER = make_parser(config)
+
 
 def get_parsy_error_location(error, file_path):
     line, column = parsy.line_info_at(error.stream, error.index)
     return IssueLocation(line=line, column=column, file_path=file_path)
 
 
-def parse_file(path_and_config):
+def parse_file(path):
     """
     Returns a tuple ([Issue], File | None).
     """
-    path, config = path_and_config
-
     with path.open("r") as f:
         source = f.read()
 
-    return parse_source(path, config, source)
-
+    return parse_source(path, source)
 
-def parse_source(path: Path, config, source: str):
-    parser = make_parser(config)
 
+def parse_source(path: Path, source: str):
     try:
         file = File(
             path=path,
             source=source,
             lines=source.split("\n"),
-            tree=parser["content"].parse(source),
+            tree=PARSER["content"].parse(source),
         )
         return [], file
     except parsy.ParseError as error:
@@ -46,36 +53,30 @@ def parse_source(path: Path, config, source: str):
 
 def lint(paths: Set[Path], config):
     issues = []
-    files = []
-
-    from multiprocessing import Pool
-
-    pool = Pool()
-
-    parse_file_args = ((p, config) for p in paths)
-    results = pool.map(parse_file, parse_file_args)
-    for result in results:
-        parse_issues, file = result
-        issues += parse_issues
-        if file is not None:
-            files.append(file)
-
-    if config.get("parse_only", False):
-        return issues
 
+    parse_only = config.get("parse_only", False)
     rules = config.get("rules")
 
-    if rules:
-        issues += check_files(files, rules)
+    with ProcessPoolExecutor(
+        initializer=_make_parser, initargs=(config,)
+    ) as executor:
+        futures = [executor.submit(parse_file, path) for path in paths]
+        for future in as_completed(futures):
+            parse_issues, file = future.result()
+            issues.extend(parse_issues)
+            if file is not None and not parse_only:
+                issues.extend(check_file(file, rules))
 
     return issues
 
 
 def lint_one(path: Path, config):
+    _make_parser(config)
+
     if not path.is_file() and str(path) == "-":
         source = sys.stdin.read()
         parse_issues, file = parse_source(
-            config.get("stdin_filepath", path), config, source
+            config.get("stdin_filepath", path), source
         )
     else:
         parse_issues, file = parse_file((path, config))