|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse |
| 3 | +import subprocess |
| 4 | +import sys |
| 5 | +from dataclasses import dataclass |
| 6 | +from itertools import chain |
| 7 | +from pathlib import Path |
| 8 | +from typing import Dict, Iterable, List, Literal, Optional, Tuple |
| 9 | + |
| 10 | + |
@dataclass
class GitDiffTreeRecord:
    """A single parsed entry of the raw output printed by 'git diff-tree'.

    Field order matters for positional construction and is kept stable.
    """

    # File mode reported for the source side of the change.
    src_mode: str
    # Object hash reported for the source side of the change.
    src_hash: str
    # File mode reported for the destination side of the change.
    dst_mode: str
    # Object hash reported for the destination side of the change.
    dst_hash: str
    # First path listed on the record line.
    src_path: Path
    # Second path listed on the record line, or None when only one is present.
    dst_path: Optional[Path]
    # Single-letter status code of the change.
    status: Literal["A", "C", "D", "M", "R", "T", "U", "X"]
    # Number that directly follows the status letter, or None when absent.
    score: Optional[int]
| 23 | + |
| 24 | + |
@dataclass
class GitChange:
    """Couples a 'git diff-tree' record with the size delta of the file it describes."""

    # The parsed diff-tree entry for the changed file.
    diff_record: GitDiffTreeRecord
    # Size difference in bytes (destination blob size minus source blob size).
    bytes_changed: int
| 29 | + |
| 30 | + |
def parse_git_diff_tree_output(output: str) -> List[GitDiffTreeRecord]:
    """Parse the raw output of `git diff-tree` into records.

    Each record line has the layout described in the "Raw Output Format"
    section of the git man page: a leading colon, then the source mode,
    destination mode, source hash, destination hash and the status (with an
    optional numeric score) separated by single spaces, followed by one or
    two TAB-separated paths.

    Lines that do not start with a colon (for example the commit id header
    that `git diff-tree` prints when it is given a single commit, or blank
    lines) are not records and are skipped instead of crashing the parser.

    :param output: The text printed by `git diff-tree`
    :type output: str

    :return: One record per raw diff line, in output order
    :rtype: List[GitDiffTreeRecord]
    """

    def make_record(line: str) -> GitDiffTreeRecord:
        # Strip the leading colon; the fifth chunk still holds the status
        # field and the TAB-separated path(s).
        src_mode, dst_mode, src_hash, dst_hash, rest = line[1:].split(" ", maxsplit=4)
        status_and_score, *paths = rest.split("\t")
        return GitDiffTreeRecord(
            src_mode=src_mode,
            src_hash=src_hash,
            dst_mode=dst_mode,
            dst_hash=dst_hash,
            # The first character is the status letter, any remainder the score.
            status=status_and_score[0],
            score=int(status_and_score[1:]) if len(status_and_score) > 1 else None,
            src_path=Path(paths[0]),
            # A second path is only present on some records.
            dst_path=Path(paths[1]) if len(paths) >= 2 else None,
        )

    return [make_record(line) for line in output.splitlines() if line.startswith(":")]
| 51 | + |
| 52 | + |
def get_blob_sizes(hashes: Iterable[str]) -> Dict[str, Optional[int]]:
    """Look up the size, in bytes, of git blobs

    The distinct hashes are written, one per line, to the standard input of a
    single `git cat-file --batch-check` process and its output is parsed.

    :param hashes: An iterable of git blob hashes
    :type hashes: Iterable[str]

    :return: A dictionary mapping each reported hash to its size, or to None
        when git reports the object as missing
    :rtype: Dict[str, Optional[int]]
    """
    unique_hashes = set(hashes)
    completed = subprocess.run(
        ["git", "cat-file", "--batch-check"],
        input="\n".join(unique_hashes),
        capture_output=True,
        text=True,
        check=True,
    )

    sizes: Dict[str, Optional[int]] = {}
    for report_line in completed.stdout.splitlines():
        # First field is the hash, last field is either the size or "missing".
        blob_hash, *_, size_field = report_line.split()
        sizes[blob_hash] = None if size_field == "missing" else int(size_field)
    return sizes
| 77 | + |
| 78 | + |
def get_file_size_differences(commit_range: str) -> Dict[Path, GitChange]:
    """Work out how much each changed file grew or shrank, in bytes

    Runs `git diff-tree -r` on the range, looks up the blob sizes of both
    sides of every record and subtracts the source size from the destination
    size. A blob without a known size counts as 0 bytes.

    :param commit_range: A git commit range (e.g. HEAD~3..HEAD)
    :type commit_range: str

    :return: A dictionary mapping each record's source path to its change
    :rtype: Dict[Path, GitChange]
    """
    diff_tree = subprocess.run(
        ["git", "diff-tree", "-r", commit_range],
        capture_output=True,
        text=True,
        check=True,
    )
    records = parse_git_diff_tree_output(diff_tree.stdout)

    # Collect the hashes of both sides of every record for one batched lookup.
    blob_hashes = chain.from_iterable((rec.src_hash, rec.dst_hash) for rec in records)
    sizes = get_blob_sizes(blob_hashes)

    # Only additions, deletions and modifications are expected here.
    assert {"A", "D", "M"}.issuperset(rec.status for rec in records)

    differences: Dict[Path, GitChange] = {}
    for rec in records:
        new_size = sizes[rec.dst_hash] or 0
        old_size = sizes[rec.src_hash] or 0
        differences[rec.src_path] = GitChange(
            diff_record=rec,
            bytes_changed=new_size - old_size,
        )
    return differences
| 112 | + |
| 113 | + |
def main(
    commit_range: str,
    quiet: bool = False,
    limit: Optional[int] = None,
    show_n_largest_files: int = 30,
) -> Literal[0, 1]:
    """Report the file size differences of a commit range and check the limit

    :param commit_range: A git commit range (e.g. HEAD~3..HEAD)
    :param quiet: When true, nothing is printed
    :param limit: Optional maximum for the summed size difference, in bytes
    :param show_n_largest_files: How many of the largest differences to list

    :return: 1 when a limit is given and the total exceeds it, otherwise 0
    """
    changes = get_file_size_differences(commit_range)
    total = sum(change.bytes_changed for change in changes.values())
    over_limit = limit is not None and total > limit
    exit_code = 1 if over_limit else 0

    def signed(num: int) -> str:
        # Non-negative values get an explicit plus sign.
        sign = "+" if num >= 0 else ""
        return sign + human_friendly_bytes(num)

    if quiet:
        return exit_code

    print(f"Total file size difference for commit range '{commit_range}': ")
    limit_note = f" (Exceeds set limit of {signed(limit)})" if over_limit else ""
    print(f"\t{signed(total)}" + limit_note)

    # Biggest growth first, truncated to the requested number of entries.
    ranked = sorted(changes.items(), key=lambda item: item[1].bytes_changed, reverse=True)
    top_entries = ranked[:show_n_largest_files]

    if top_entries:
        print()
        print(f"Largest {len(top_entries)} filesize differences:")

        for path, change in top_entries:
            print(f"\t{signed(change.bytes_changed)}\t{path}")

    return exit_code
| 144 | + |
| 145 | + |
def num_bytes(arg: str) -> int:
    """Convert a command line string into a number of bytes

    Accepts either a plain integer or an integer followed by one of the
    two-letter suffixes KB, MB, GB, TB, PB, EB, ZB or YB, where every step up
    multiplies the value by another factor of 1024.

    :param arg: The text to convert (e.g. 65536 or 2MB)
    :type arg: str

    :return: The number of bytes
    :rtype: int

    :raises argparse.ArgumentTypeError: If the text has neither form
    """
    unparsable = argparse.ArgumentTypeError(f"'{arg}' cannot be parsed into a number of bytes")

    # A plain integer is taken as a raw byte count.
    try:
        return int(arg)
    except ValueError:
        pass

    # At least one digit plus a two-letter suffix is required from here on.
    if len(arg) < 3:
        raise unparsable

    digits, unit = arg[:-2], arg[-2:]
    units = ("KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    if unit not in units:
        raise unparsable

    # Each position in the tuple is worth another 10 bits of left shift.
    exponent = 10 * (units.index(unit) + 1)
    try:
        magnitude = int(digits)
    except ValueError as exc:
        raise unparsable from exc
    return magnitude << exponent
| 177 | + |
| 178 | + |
def human_friendly_bytes(num: int) -> str:
    """Format a number of bytes as a human friendly string

    The value is divided by 1024 until its magnitude drops below 1024 or the
    largest unit (YB) is reached, then rendered with one decimal place.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    scaled = num
    position = 0
    last = len(units) - 1
    while position < last and not abs(scaled) < 1024.0:
        scaled /= 1024.0
        position += 1
    return f"{scaled:.1f}{units[position]}"
| 186 | + |
| 187 | + |
if __name__ == "__main__":
    # Command line entry point. The argument destinations match the parameter
    # names of main(), so the parsed namespace can be passed on as keywords.
    parser = argparse.ArgumentParser(
        description="A program that summarizes the file size differences between two git commits."
    )
    parser.add_argument(
        "commit_range",
        # A positional argument is required unless nargs="?" is given; without
        # it argparse never applies the default below.
        nargs="?",
        default="HEAD^..HEAD",
        type=str,
        help="Commit range to compute the diff for (e.g. HEAD~3..HEAD)",
    )
    parser.add_argument("--quiet", action="store_true", help="Silence all output")
    parser.add_argument(
        "--limit",
        type=num_bytes,
        help="Exit non-zero if total changes exceeds this value. "
        + "Can be a raw number of bytes (e.g. 65536) or a suffixed value (e.g 2MB)",
    )
    parser.add_argument(
        "--show-n-largest-files",
        type=int,
        help="Show this many of the largest files in diff",
        default=30,
    )
    # The return value of main() becomes the process exit status.
    sys.exit(main(**vars(parser.parse_args())))
0 commit comments