|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Analyze a snapshot PR diff file and classify all changes. |
| 3 | +
|
| 4 | +Usage: python3 analyze_diff.py <diff_file> |
| 5 | +
|
| 6 | +The diff file should be the raw output of `gh pr diff`. |
| 7 | +Outputs structured text sections for hash changes, merges, version upgrades, |
| 8 | +new data pruned from MDBX, and unexpected deletions. |
| 9 | +""" |
| 10 | + |
| 11 | +import re |
| 12 | +import sys |
| 13 | +from collections import defaultdict |
| 14 | + |
| 15 | + |
def parse_diff(path):
    """Collect removed and added ``'file' = 'hash'`` entries from a diff file.

    Only lines of the form ``+'<name>' = '<lowercase hex>'`` or
    ``-'<name>' = '<lowercase hex>'`` are considered; every other line
    (diff headers, context lines, unrelated changes) is ignored.

    Args:
        path: Path of the file holding the raw diff text.

    Returns:
        A ``(removed, added)`` tuple of dicts mapping file name -> hash.
        If a name occurs more than once on the same side, the last
        occurrence wins.
    """
    # The pattern is anchored on the leading +/- sign, so it also rejects
    # empty lines and lines that do not start with a diff marker.
    entry_re = re.compile(r"^([+-])'([^']+)'\s*=\s*'([a-f0-9]+)'")
    removed = {}
    added = {}
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform locale; undecodable bytes elsewhere in the diff are replaced
    # rather than aborting the whole run.
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            m = entry_re.match(line)
            if m is None:
                continue
            sign, fname, hsh = m.groups()
            if sign == "-":
                removed[fname] = hsh
            else:
                added[fname] = hsh
    return removed, added
| 33 | + |
| 34 | + |
def parse_filename(fname):
    """Split a snapshot file name into its category, version, range and type.

    Returns a dict whose ``"cat"`` key is one of:

    * ``"other"``   - the two salt files; the dict also carries ``"fname"``.
    * ``"caplin"``  - names of the form ``caplin/<ver>-<s>-<e>-<dt>.<ext>``.
    * ``"accessor"``, ``"domain"``, ``"history"``, ``"idx"`` - names of the
      form ``<cat>/<ver>-<dt>.<s>-<e>.<ext>``.
    * ``"blocks"``  - names of the form ``<ver>-<s>-<e>-<dt>.<ext>``.
    * ``"unknown"`` - anything else; the dict also carries ``"fname"``.

    For the recognised layouts the dict holds ``ver``, ``dt``, ``ext`` as
    strings and ``s`` / ``e`` as ints.
    """
    if fname == "salt-blocks.txt" or fname == "salt-state.txt":
        return {"cat": "other", "fname": fname}

    caplin = re.match(r"^caplin/(v[\d.]+)-(\d+)-(\d+)-([^.]+)\.(.+)$", fname)
    if caplin is not None:
        version, start, end, data_type, extension = caplin.groups()
        return {
            "cat": "caplin",
            "ver": version,
            "s": int(start),
            "e": int(end),
            "dt": data_type,
            "ext": extension,
        }

    state = re.match(r"^(accessor|domain|history|idx)/(v[\d.]+)-([^.]+)\.(\d+)-(\d+)\.(.+)$", fname)
    if state is not None:
        category, version, data_type, start, end, extension = state.groups()
        return {
            "cat": category,
            "ver": version,
            "dt": data_type,
            "s": int(start),
            "e": int(end),
            "ext": extension,
        }

    blocks = re.match(r"^(v[\d.]+)-(\d+)-(\d+)-(transactions-to-block|[^.]+)\.(.+)$", fname)
    if blocks is not None:
        version, start, end, data_type, extension = blocks.groups()
        return {
            "cat": "blocks",
            "ver": version,
            "dt": data_type,
            "s": int(start),
            "e": int(end),
            "ext": extension,
        }

    return {"cat": "unknown", "fname": fname}
| 48 | + |
| 49 | + |
def hgroup(cat):
    """Map a file category to the report group label.

    ``accessor``/``domain``/``history``/``idx`` -> ``"state"``,
    ``caplin`` -> ``"cl"``, ``blocks`` -> ``"el"``, anything else -> ``"other"``.
    """
    label_members = (
        ("state", ("accessor", "domain", "history", "idx")),
        ("cl", ("caplin",)),
        ("el", ("blocks",)),
    )
    for label, members in label_members:
        if cat in members:
            return label
    return "other"
| 58 | + |
| 59 | + |
def classify(removed, added):
    """Sort the removed/added snapshot entries into report categories.

    Args:
        removed: dict mapping file name -> hash for entries deleted by the diff.
        added: dict mapping file name -> hash for entries introduced by the diff.

    Returns:
        A 5-tuple ``(hash_changes, merges, version_upgrades_list, frontier,
        unexpected)``:

        * ``hash_changes``: ``(fname, old_hash, new_hash)`` tuples for names
          present on both sides with different hashes.
        * ``merges``: info dicts for added files whose range covers two or
          more removed files, or exactly one removed file with a different
          range.
        * ``version_upgrades_list``: info dicts for added files whose version
          matches none of the removed files they cover. The same dict object
          can appear in both ``merges`` and this list.
        * ``frontier``: added files that cover no removed file and whose
          range overlaps no removed file of the same (cat, dt, ext) group.
        * ``unexpected``: removed files not covered by any added file.

        Info dicts carry ``cat``, ``dt``, ``ext``, ``rem_ranges``
        (``(s, e, ver)`` per covered file), ``add_range`` (``(s, e, ver)``),
        ``is_vu``, ``old_vers`` (sorted, de-duplicated) and ``new_ver``.

    Files that ``parse_filename`` labels "other" or "unknown" only ever show
    up in ``hash_changes``.
    """
    # 1. Hash changes: same name on both sides, different hash.
    hash_changes = [
        (fname, old_hash, added[fname])
        for fname, old_hash in removed.items()
        if fname in added and old_hash != added[fname]
    ]
    hash_change_fnames = {fname for fname, _, _ in hash_changes}

    # 2. Bucket the remaining files by (cat, dt, ext); ranges are only
    #    compared between files of the same bucket.
    groups = defaultdict(lambda: {"rem": [], "add": []})
    for side, fnames in (("rem", removed), ("add", added)):
        for fname in fnames:
            if fname in hash_change_fnames:
                continue
            info = parse_filename(fname)
            if info["cat"] in ("other", "unknown"):
                continue
            groups[(info["cat"], info["dt"], info["ext"])][side].append({**info, "fname": fname})

    merges = []
    version_upgrades_list = []
    frontier = []
    unexpected = []
    explained_r = set()
    explained_a = set()

    for key in sorted(groups):
        cat, dt, ext = key
        rem = sorted(groups[key]["rem"], key=lambda x: (x["s"], x["e"]))
        add = sorted(groups[key]["add"], key=lambda x: (x["s"], x["e"]))

        # Added files are visited in (start, end) order; each one claims the
        # still-unclaimed removed files that lie entirely inside its range.
        for a in add:
            covered = [
                r for r in rem
                if r["s"] >= a["s"] and r["e"] <= a["e"] and r["fname"] not in explained_r
            ]
            if not covered:
                continue
            # Sorted so the result does not depend on set iteration order.
            old_vers = sorted({r["ver"] for r in covered})
            is_vu = a["ver"] not in old_vers
            info = {
                "cat": cat, "dt": dt, "ext": ext,
                "rem_ranges": [(r["s"], r["e"], r["ver"]) for r in covered],
                "add_range": (a["s"], a["e"], a["ver"]),
                "is_vu": is_vu, "old_vers": old_vers, "new_ver": a["ver"],
            }
            if is_vu:
                version_upgrades_list.append(info)
            # A one-for-one replacement over the identical range is not a merge.
            same_single_range = (
                len(covered) == 1
                and (covered[0]["s"], covered[0]["e"]) == (a["s"], a["e"])
            )
            if not same_single_range:
                merges.append(info)
            explained_r.update(r["fname"] for r in covered)
            explained_a.add(a["fname"])

        # NOTE(review): an added file that claimed nothing above but still
        # overlaps some removed range lands in none of the returned lists.
        for a in add:
            if a["fname"] in explained_a:
                continue
            if not any(r["s"] < a["e"] and r["e"] > a["s"] for r in rem):
                frontier.append({"cat": cat, "dt": dt, "ext": ext, "s": a["s"], "e": a["e"], "ver": a["ver"], "fname": a["fname"]})

        for r in rem:
            if r["fname"] not in explained_r:
                unexpected.append({"cat": cat, "dt": dt, "ext": ext, "s": r["s"], "e": r["e"], "ver": r["ver"], "fname": r["fname"]})

    return hash_changes, merges, version_upgrades_list, frontier, unexpected
| 127 | + |
| 128 | + |
def print_report(removed, added, hash_changes, merges, version_upgrades_list, frontier, unexpected):
    """Print the classification results as text sections on stdout.

    Sections appear in this order: hash changes, unexpected deletions,
    merges table, version upgrades, new data pruned from MDBX, and a final
    totals line. ``removed`` and ``added`` are only used for their sizes.
    """
    # Hash changes: one line per file, then a count.
    print("=== HASH CHANGES ===")
    for name, old_hash, new_hash in hash_changes:
        group = hgroup(parse_filename(name)["cat"])
        print(f" [{group}] {name} old={old_hash} new={new_hash}")
    print(f" count={len(hash_changes)}")

    # Unexpected deletions: one line per file, then a count.
    print("=== UNEXPECTED DELETIONS ===")
    for entry in unexpected:
        group = hgroup(entry["cat"])
        print(f" [{group}] {entry['fname']}")
    print(f" count={len(unexpected)}")

    # Merges table: merges sharing group, category, ranges and versions are
    # folded into one row listing every data type involved.
    print("=== MERGES TABLE ===")
    rows = defaultdict(list)
    for merge in merges:
        old_ranges = tuple((start, end) for start, end, _ in merge["rem_ranges"])
        new_range = (merge["add_range"][0], merge["add_range"][1])
        row_key = (
            hgroup(merge["cat"]),
            merge["cat"],
            old_ranges,
            new_range,
            merge["new_ver"],
            tuple(sorted(merge["old_vers"])),
            merge["is_vu"],
        )
        rows[row_key].append(f"{merge['dt']} (.{merge['ext']})")

    for row_key, labels in sorted(rows.items()):
        group, cat, old_ranges, new_range, new_ver, old_vers, upgraded = row_key
        old_text = ", ".join(f"{start}-{end}" for start, end in old_ranges)
        # A version upgrade names every old version; otherwise only the
        # versions that differ from the new one are mentioned, if any.
        absorbed = list(old_vers) if upgraded else [v for v in old_vers if v != new_ver]
        if upgraded or absorbed:
            note = f"cross-version: absorbs {','.join(absorbed)}"
        else:
            note = ""
        types_str = ", ".join(sorted(set(labels)))
        print(f" [{group}] | {cat} | {types_str} | {old_text} | {new_range[0]}-{new_range[1]} [{new_ver}] | {note}")

    # Version upgrades: grouped by (group, category, old versions, new version).
    print("=== VERSION UPGRADES ===")
    upgrades = defaultdict(list)
    for upgrade in version_upgrades_list:
        upgrade_key = (
            hgroup(upgrade["cat"]),
            upgrade["cat"],
            tuple(sorted(upgrade["old_vers"])),
            upgrade["new_ver"],
        )
        ranges_text = ", ".join(f"{start}-{end}" for start, end, _ in upgrade["rem_ranges"])
        new_start = upgrade["add_range"][0]
        new_end = upgrade["add_range"][1]
        upgrades[upgrade_key].append(
            f"{upgrade['dt']} (.{upgrade['ext']}): {ranges_text} -> {new_start}-{new_end}"
        )
    for upgrade_key, details in sorted(upgrades.items()):
        group, cat, old_vers, new_ver = upgrade_key
        print(f" [{group}] {cat}: {','.join(old_vers)} -> {new_ver}")
        for detail in sorted(details):
            print(f" {detail}")

    # Frontier / new data pruned from MDBX: grouped by (group, category).
    print("=== NEW DATA PRUNED FROM MDBX ===")
    by_category = defaultdict(list)
    for entry in frontier:
        by_category[(hgroup(entry["cat"]), entry["cat"])].append(entry)
    for category_key, entries in sorted(by_category.items()):
        group, cat = category_key
        print(f" [{group}] {cat}: {len(entries)} files")
        ordered = sorted(entries, key=lambda x: (x["ext"], x["dt"], x["s"], x["e"], x["ver"]))
        for entry in ordered:
            print(f" {entry['fname']}")

    # Totals
    n_removed = len(removed)
    n_added = len(added)
    print(f"=== TOTALS: removed={n_removed} added={n_added} hash_changes={len(hash_changes)} merges={len(merges)} vu={len(version_upgrades_list)} frontier={len(frontier)} unexpected={len(unexpected)} ===")
| 190 | + |
| 191 | + |
def main():
    """Command-line entry point: parse the diff, classify it, print the report.

    Expects exactly one argument, the path of the diff file. With any other
    argument count a usage line is written to stderr and the process exits
    with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print(f"Usage: {sys.argv[0]} <diff_file>", file=sys.stderr)
        sys.exit(1)

    removed, added = parse_diff(cli_args[0])
    # classify returns (hash_changes, merges, version_upgrades_list,
    # frontier, unexpected), which is the order print_report expects.
    results = classify(removed, added)
    print_report(removed, added, *results)


if __name__ == "__main__":
    main()
0 commit comments