From 646218302b2a60059403a0565023b4e7d71c2551 Mon Sep 17 00:00:00 2001
From: Amethyst Reese
Date: Sun, 11 Feb 2024 18:20:19 -0800
Subject: [PATCH 1/2] Simple benchmark of ufmt_file and ufmt_paths

---
 makefile                |  4 ++
 ufmt/tests/benchmark.py | 90 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 ufmt/tests/benchmark.py

diff --git a/makefile b/makefile
index d14a018..d652e84 100644
--- a/makefile
+++ b/makefile
@@ -35,6 +35,10 @@ test:
 deps:
 	python -m pessimist -c 'python -m $(SRCS).tests' --requirements= --fast .
 
+.PHONY: benchmark
+benchmark:
+	python -m $(SRCS).tests.benchmark
+
 .PHONY: html
 html: .venv README.md docs/*.rst docs/conf.py
 	source $(ACTIVATE) && sphinx-build -ab html docs html
diff --git a/ufmt/tests/benchmark.py b/ufmt/tests/benchmark.py
new file mode 100644
index 0000000..9805028
--- /dev/null
+++ b/ufmt/tests/benchmark.py
@@ -0,0 +1,90 @@
+# Copyright Amethyst Reese, Tim Hatch
+# Licensed under the MIT license
+
+import time
+from pathlib import Path
+from typing import Any, List
+
+from typing_extensions import Self
+
+from ufmt import ufmt_file, ufmt_paths
+
+ROOT = Path(__file__).parent.parent.parent
+
+
+class Timer:
+    def __init__(self, name: str) -> None:
+        self.name = name
+        self.totals: List[int] = []
+
+    @classmethod
+    def fields(cls) -> str:
+        headline = f"{'name':^40} {'min':^10} {'mean':^10} {'max':^10}"
+        underline = "-" * len(headline)
+        return f"{headline}\n{underline}"
+
+    def __str__(self) -> str:
+        short = min(self.totals)
+        long = max(self.totals)
+        avg = sum(self.totals) // len(self.totals)
+        fields = " ".join(f"{value // 1000:>7} µs" for value in (short, avg, long))
+        return f"{self.name + ':':<40} {fields}"
+
+    def __enter__(self) -> Self:
+        self.before = time.monotonic_ns()
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        after = time.monotonic_ns()
+        self.totals.append(after - self.before)
+
+
+def benchmark() -> None:
+    print("starting benchmark...")
+
+    ufmt_dir = ROOT / "ufmt"
+    ufmt_core = ufmt_dir / "core.py"
+    assert ufmt_dir.is_dir(), f"{ufmt_dir} not found, must run benchmark from repo"
+
+    print()
+    print(Timer.fields())
+
+    timer = Timer("ufmt_file")
+    for _ in range(5):
+        with timer:
+            ufmt_file(ufmt_core, dry_run=True)
+    print(timer)
+
+    timer = Timer("ufmt_file, diff=True")
+    for _ in range(5):
+        with timer:
+            ufmt_file(ufmt_core, dry_run=True, diff=True)
+    print(timer)
+
+    timer = Timer("ufmt_file, return_content=True")
+    for _ in range(5):
+        with timer:
+            ufmt_file(ufmt_core, dry_run=True, return_content=True)
+    print(timer)
+
+    timer = Timer("ufmt_paths")
+    for _ in range(5):
+        with timer:
+            list(ufmt_paths([ufmt_dir], dry_run=True))
+    print(timer)
+
+    timer = Timer("ufmt_paths, diff=True")
+    for _ in range(5):
+        with timer:
+            list(ufmt_paths([ufmt_dir], dry_run=True, diff=True))
+    print(timer)
+
+    timer = Timer("ufmt_paths, return_content=True")
+    for _ in range(5):
+        with timer:
+            list(ufmt_paths([ufmt_dir], dry_run=True, return_content=True))
+    print(timer)
+
+
+if __name__ == "__main__":
+    benchmark()

From d2a25589e56db8a55c8885ae2271a4372f078cff Mon Sep 17 00:00:00 2001
From: Amethyst Reese
Date: Thu, 15 Feb 2024 17:33:46 -0800
Subject: [PATCH 2/2] Experiment to cache results

---
 ufmt/core.py       | 58 ++++++++++++++++++++++++++----------
 ufmt/tests/core.py |  2 ++
 ufmt/types.py      |  1 +
 ufmt/util.py       | 73 ++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 117 insertions(+), 17 deletions(-)

diff --git a/ufmt/core.py b/ufmt/core.py
index d3274ab..caf093d 100644
--- a/ufmt/core.py
+++ b/ufmt/core.py
@@ -30,7 +30,13 @@
     UsortConfig,
     UsortConfigFactory,
 )
-from .util import make_black_config, normalize_result, read_file, write_file
+from .util import (
+    make_black_config,
+    normalize_result,
+    read_file,
+    ResultCache,
+    write_file,
+)
 
 LOG = logging.getLogger(__name__)
 
@@ -184,24 +190,28 @@ def ufmt_file(
         the skip exception, or ``True`` if no message is given.
     """
     path = path.resolve()
-    black_config = (black_config_factory or make_black_config)(path)
-    usort_config = (usort_config_factory or UsortConfig.find)(path)
-
     LOG.debug(f"Checking {path}")
 
+    cache = ResultCache()
     result = Result(path)
 
     try:
        src_contents, encoding, newline = read_file(path)
-        dst_contents = ufmt_bytes(
-            path,
-            src_contents,
-            encoding=encoding,
-            black_config=black_config,
-            usort_config=usort_config,
-            pre_processor=pre_processor,
-            post_processor=post_processor,
-        )
+        if cache.check(path, src_contents):
+            result.cached = True
+            dst_contents = src_contents
+        else:
+            black_config = (black_config_factory or make_black_config)(path)
+            usort_config = (usort_config_factory or UsortConfig.find)(path)
+            dst_contents = ufmt_bytes(
+                path,
+                src_contents,
+                encoding=encoding,
+                black_config=black_config,
+                usort_config=usort_config,
+                pre_processor=pre_processor,
+                post_processor=post_processor,
+            )
     except SkipFormatting as e:
         dst_contents = src_contents
         result.skipped = str(e) or True
@@ -219,7 +229,13 @@ def ufmt_file(
     result.before = src_result
     result.after = dst_result
 
-    if src_contents != dst_contents:
+    if result.cached:
+        pass
+
+    elif src_contents == dst_contents:
+        cache.mark(path, src_contents)
+
+    else:
         result.changed = True
 
         if diff:
@@ -234,6 +250,7 @@ def ufmt_file(
         try:
             write_file(path, dst_contents, newline=newline)
             result.written = True
+            cache.mark(path, dst_contents)
 
         except Exception as e:
             result.error = e
@@ -373,6 +390,9 @@ def ufmt_paths(
         Trailrunner() if concurrency is None else Trailrunner(concurrency=concurrency)
     )
 
+    cache = ResultCache()
+    cache.prepare()
+
     def generate_paths() -> Generator[Path, None, None]:
         """
         yield paths to format, using trailrunner to walk directories and exclude paths
@@ -382,7 +402,13 @@ def generate_paths() -> Generator[Path, None, None]:
                 LOG.warning("Cannot mix stdin ('-') with normal paths, ignoring")
                 continue
             config = ufmt_config(path, root)
-            yield from runner.walk(path, excludes=config.excludes)
+            for p in runner.walk(path, excludes=config.excludes):
+                p = p.resolve()
+                content, _, _ = read_file(p)
+                if cache.check(p, content):
+                    continue
+
+                yield p
 
     fn = partial(
         ufmt_file,
@@ -426,3 +452,5 @@ def generate_paths() -> Generator[Path, None, None]:
         combined = chain([first, second], gen)  # combine first, second, and the rest
         for _, result in runner.run_iter(combined, fn):
             yield result
+
+    cache.cleanup()
diff --git a/ufmt/tests/core.py b/ufmt/tests/core.py
index 4b5d7aa..d4e12d5 100644
--- a/ufmt/tests/core.py
+++ b/ufmt/tests/core.py
@@ -25,11 +25,13 @@
 )
 
 FAKE_CONFIG = """
+
 [tool.ufmt]
 excludes = [
     "foo/frob/",
     "__init__.py",
 ]
+
 """
 
 POORLY_FORMATTED_CODE = """\
diff --git a/ufmt/types.py b/ufmt/types.py
index 2294ff1..beb59eb 100644
--- a/ufmt/types.py
+++ b/ufmt/types.py
@@ -54,6 +54,7 @@ class Result:
     path: Path
     changed: bool = False
     written: bool = False
+    cached: bool = False
     skipped: Union[bool, str] = False
     diff: Optional[str] = None
     error: Optional[Exception] = None
diff --git a/ufmt/util.py b/ufmt/util.py
index 043ba5e..c7a5d3b 100644
--- a/ufmt/util.py
+++ b/ufmt/util.py
@@ -2,13 +2,17 @@
 # Licensed under the MIT license
 
 import os
+import sqlite3
+import time
 import tokenize
+import zlib
+from contextlib import closing
 from pathlib import Path
-from typing import Tuple
+from typing import Optional, Tuple
 
 from black import find_pyproject_toml, parse_pyproject_toml, TargetVersion
 
-from .types import BlackConfig, Encoding, FileContent, Newline
+from .types import BlackConfig, Encoding, FileContent, Newline, SkipFormatting
 
 
 def make_black_config(path: Path) -> BlackConfig:
@@ -97,3 +101,68 @@ def enable_libcst_native() -> None:
         os.environ["LIBCST_PARSER_TYPE"] = "native"
     except ImportError:  # pragma: nocover
         pass
+
+
+class ResultCache:
+    def __init__(
+        self,
+        cache_path: Optional[Path] = None,
+        threshold: int = 7 * 86400,
+    ) -> None:
+        if cache_path is None:
+            cache_path = Path.cwd() / ".ufmt_cache" / "cache.db"
+        cache_path.parent.mkdir(exist_ok=True)
+        self.cache_path = cache_path
+        self.threshold = threshold
+
+    def prepare(self) -> None:
+        with closing(sqlite3.connect(self.cache_path)) as db:
+            with db:
+                db.execute(
+                    """
+                    create table if not exists clean (
+                        `path` text,
+                        `crc` integer,
+                        `seen` integer,
+                        unique(`path`, `crc`)
+                    )"""
+                )
+
+    def cleanup(self) -> None:
+        with closing(sqlite3.connect(self.cache_path)) as db:
+            with db:
+                db.execute(
+                    """
+                    delete from clean where rowid in (
+                        select rowid from clean where `seen` < ?
+                    )
+                    """,
+                    (int(time.time()) - self.threshold,),
+                )
+
+    def check(self, path: Path, content: FileContent) -> bool:
+        path_str = path.as_posix()
+        crc = zlib.adler32(content)
+        with closing(sqlite3.connect(self.cache_path)) as db:
+            with db:
+                cursor = db.execute(
+                    "select * from clean where `path` = ? and `crc` = ?",
+                    (path_str, crc),
+                )
+                if cursor.fetchone():
+                    db.execute(
+                        "update clean set `seen` = ? where `path` = ? and `crc` = ?",
+                        (int(time.time()), path_str, crc),
+                    )
+                    return True
+        return False
+
+    def mark(self, path: Path, content: FileContent) -> None:
+        path_str = path.as_posix()
+        crc = zlib.adler32(content)
+        with closing(sqlite3.connect(self.cache_path)) as db:
+            with db:
+                db.execute(
+                    "insert into clean (`path`, `crc`, `seen`) values (?, ?, ?) on conflict (`path`, `crc`) do update set `seen` = excluded.`seen`",
+                    (path_str, crc, int(time.time())),
+                )
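
Usage sketch (not part of either patch): the snippet below exercises the ResultCache added in PATCH 2/2 directly, assuming the patch is applied and the working directory has a fresh (or absent) .ufmt_cache; the file path and contents are hypothetical. prepare() has to run before check() or mark() so the `clean` table exists, mirroring the call order in ufmt_paths().

    from pathlib import Path

    from ufmt.util import ResultCache

    cache = ResultCache()  # defaults to Path.cwd() / ".ufmt_cache" / "cache.db"
    cache.prepare()        # create the `clean` table if it does not exist yet

    path = Path("example.py").resolve()  # hypothetical file
    content = b"x = 1\n"                 # hypothetical contents

    assert not cache.check(path, content)  # unknown (path, crc) pair: cache miss
    cache.mark(path, content)              # record the pair as already clean
    assert cache.check(path, content)      # cache hit; `seen` timestamp refreshed

    cache.cleanup()  # drop rows not seen within `threshold` seconds (default 7 days)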