From 670c03d43420c3b80386b1462443c7f669b420c4 Mon Sep 17 00:00:00 2001
From: Adam Johnson
Date: Fri, 11 Mar 2022 07:57:24 +0000
Subject: [PATCH] Optimize parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few changes:

1. Switch from `multiprocessing` to `concurrent.futures` with
   `ProcessPoolExecutor`. This is just to make the code easier to work with.

2. Perform `check_file()` as results return to the main process, in order to
   reduce peak memory usage. Previously, all parsed files were kept in memory
   before being checked, leading to massive memory usage.

3. Construct parsers once per process, rather than once per file. Previously,
   parser construction took ~5% of the runtime; this reduces it to a constant
   per-process cost (see the sketch at the end of this message).

Benchmarked on a project with 238 templates.

Before:

```
$ time curlylint templates/**/*.html
All done! ✨ 🍰 ✨
curlylint templates/**/*.html  352.25s user 3.37s system 999% cpu 35.575 total
```

After:

```
$ time curlylint templates/**/*.html
All done! ✨ 🍰 ✨
curlylint templates/**/*.html  324.22s user 2.79s system 995% cpu 32.858 total
```

That's ~8% of the runtime saved. The parser remains quite slow; I think it
does an unfortunate amount of backtracking.
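
For reference, the per-process initializer pattern behind points 1 and 3, in
minimal standalone form (illustrative names only; a plain dict stands in for
the expensive `make_parser(config)` object, and `parse_one` is a toy task):

```python
from concurrent.futures import ProcessPoolExecutor, as_completed

PARSER = None  # one instance per worker process


def _init_worker(config):
    # ProcessPoolExecutor runs this once in each worker process, so the
    # expensive object is built per process rather than per submitted task.
    global PARSER
    PARSER = dict(config)  # stand-in for the real make_parser(config)


def parse_one(text):
    # Each task reuses the worker's module-level PARSER.
    return PARSER["mode"], len(text)


if __name__ == "__main__":
    config = {"mode": "html"}
    texts = ["<p>one</p>", "<p>two</p>", "<p>three</p>"]
    with ProcessPoolExecutor(
        initializer=_init_worker, initargs=(config,)
    ) as executor:
        futures = [executor.submit(parse_one, text) for text in texts]
        # Consume results as they complete instead of holding every result
        # in memory until the whole batch has finished.
        for future in as_completed(futures):
            print(future.result())
```

Because the initializer runs once per worker, each task reuses that worker's
parser, and `as_completed()` lets the parent process check each file as soon
as its result is ready.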
""" - path, config = path_and_config - with path.open("r") as f: source = f.read() - return parse_source(path, config, source) - + return parse_source(path, source) -def parse_source(path: Path, config, source: str): - parser = make_parser(config) +def parse_source(path: Path, source: str): try: file = File( path=path, source=source, lines=source.split("\n"), - tree=parser["content"].parse(source), + tree=PARSER["content"].parse(source), ) return [], file except parsy.ParseError as error: @@ -46,36 +53,30 @@ def parse_source(path: Path, config, source: str): def lint(paths: Set[Path], config): issues = [] - files = [] - - from multiprocessing import Pool - - pool = Pool() - - parse_file_args = ((p, config) for p in paths) - results = pool.map(parse_file, parse_file_args) - for result in results: - parse_issues, file = result - issues += parse_issues - if file is not None: - files.append(file) - - if config.get("parse_only", False): - return issues + parse_only = config.get("parse_only", False) rules = config.get("rules") - if rules: - issues += check_files(files, rules) + with ProcessPoolExecutor( + initializer=_make_parser, initargs=(config,) + ) as executor: + futures = [executor.submit(parse_file, path) for path in paths] + for future in as_completed(futures): + parse_issues, file = future.result() + issues.extend(parse_issues) + if file is not None and not parse_only: + issues.extend(check_file(file, rules)) return issues def lint_one(path: Path, config): + _make_parser(config) + if not path.is_file() and str(path) == "-": source = sys.stdin.read() parse_issues, file = parse_source( - config.get("stdin_filepath", path), config, source + config.get("stdin_filepath", path), source ) else: parse_issues, file = parse_file((path, config))