Skip to content

Commit c106df8

Browse files
chakru-rclaude
andauthored
fix(precommit): smarter PDL schema version bump detection (datahub-project#17459)
Co-authored-by: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
1 parent 2ebfcdd commit c106df8

2 files changed

Lines changed: 152 additions & 12 deletions

File tree

.github/scripts/bump_schema_versions.py

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,16 +66,66 @@ def detect_default_branch() -> str:
6666
return "master"
6767

6868

69-
def get_changed_pdl_files(base_branch: str) -> list[str]:
70-
"""Return repo-relative paths of PDL files that differ from base_branch."""
69+
def get_merge_base(remote_ref: str) -> str:
70+
"""Return the merge-base commit SHA between HEAD and remote_ref."""
71+
result = subprocess.run(
72+
["git", "merge-base", "HEAD", remote_ref],
73+
capture_output=True,
74+
text=True,
75+
)
76+
if result.returncode != 0:
77+
print(
78+
f"Error finding merge-base with {remote_ref}: {result.stderr}",
79+
file=sys.stderr,
80+
)
81+
sys.exit(1)
82+
return result.stdout.strip()
83+
84+
85+
def get_base_pdl_changes(merge_base: str, remote_ref: str) -> list[str]:
86+
"""Return PDL files that changed on remote_ref since merge_base.
87+
88+
Used to detect whether any of the PDL files touched on this branch have
89+
also moved on the base branch — if so, the branch must be rebased before
90+
schemaVersion can be bumped correctly.
91+
"""
7192
try:
7293
result = subprocess.run(
7394
[
7495
"git",
7596
"diff",
7697
"--name-only",
7798
"--diff-filter=ACM",
78-
base_branch,
99+
merge_base,
100+
remote_ref,
101+
"--",
102+
"*.pdl",
103+
],
104+
capture_output=True,
105+
text=True,
106+
check=True,
107+
)
108+
return [f.strip() for f in result.stdout.strip().splitlines() if f.strip()]
109+
except subprocess.CalledProcessError as e:
110+
print(f"Error getting base branch PDL changes: {e.stderr}", file=sys.stderr)
111+
sys.exit(1)
112+
113+
114+
def get_changed_pdl_files(base_ref: str) -> list[str]:
115+
"""Return repo-relative paths of PDL files that differ between base_ref and the working tree.
116+
117+
Compares base_ref against the working tree (not HEAD) so that uncommitted
118+
changes are included — this function runs inside a pre-commit hook.
119+
base_ref may be a branch name, remote tracking ref, or commit SHA.
120+
"""
121+
try:
122+
result = subprocess.run(
123+
[
124+
"git",
125+
"diff",
126+
"--name-only",
127+
"--diff-filter=ACM",
128+
base_ref,
79129
"--",
80130
"*.pdl",
81131
],
@@ -544,14 +594,33 @@ def main() -> int:
544594
)
545595
args = parser.parse_args()
546596

547-
directly_changed = get_changed_pdl_files(args.base_branch)
597+
remote_ref = f"refs/remotes/origin/{args.base_branch}"
598+
merge_base = get_merge_base(remote_ref)
599+
600+
directly_changed = get_changed_pdl_files(merge_base)
548601

549602
if not directly_changed:
550603
print("No changed PDL files found.")
551604
return 0
552605

606+
# Check whether any PDL files changed on this branch also changed on the
607+
# base branch since divergence. If so, the branch must be rebased or
608+
# merged before schemaVersion can be bumped correctly.
609+
# Unrelated PDL changes on the base branch do not block.
610+
base_pdl_changes = get_base_pdl_changes(merge_base, remote_ref)
611+
conflicting = sorted(set(directly_changed) & set(base_pdl_changes))
612+
if conflicting:
613+
files_list = "\n".join(f" {f}" for f in conflicting)
614+
print(
615+
f"ERROR: The following PDL file(s) also changed on {args.base_branch} "
616+
f"since this branch diverged. Please merge or rebase from "
617+
f"{args.base_branch} first:\n{files_list}",
618+
file=sys.stderr,
619+
)
620+
return 1
621+
553622
if args.verbose:
554-
print(f"Comparing against branch: {args.base_branch}")
623+
print(f"Comparing against branch: {args.base_branch} (merge-base: {merge_base[:12]})")
555624
print(f"Found {len(directly_changed)} directly changed PDL file(s):")
556625
for f in directly_changed:
557626
print(f" {f}")
@@ -581,7 +650,9 @@ def main() -> int:
581650
path = Path(filepath)
582651
current_content = path.read_text(encoding="utf-8")
583652

584-
base_content = get_file_at_branch(filepath, args.base_branch)
653+
# Use remote_ref (refs/remotes/origin/…) rather than the bare branch
654+
# name — local checkouts of the base branch may not exist.
655+
base_content = get_file_at_branch(filepath, remote_ref)
585656
base_version = get_schema_version(base_content) if base_content else 0
586657
current_version = get_schema_version(current_content)
587658
new_version = base_version + 1

.github/scripts/test/test_bump_schema_versions.py

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -629,14 +629,15 @@ def test_unrelated_same_namespace_record_is_not_a_dependency(tmp_path, monkeypat
629629

630630

631631
# ---------------------------------------------------------------------------
632-
# detect_default_branch
632+
# detect_default_branch / get_merge_base / get_base_pdl_changes
633633
# ---------------------------------------------------------------------------
634634

635635

636-
def _make_proc(returncode: int, stdout: str = "") -> MagicMock:
636+
def _make_proc(returncode: int, stdout: str = "", stderr: str = "") -> MagicMock:
637637
m = MagicMock()
638638
m.returncode = returncode
639639
m.stdout = stdout
640+
m.stderr = stderr
640641
return m
641642

642643

@@ -658,6 +659,34 @@ def test_detect_default_branch_fallback_to_master(monkeypatch):
658659
assert bsv.detect_default_branch() == "master"
659660

660661

662+
def test_get_merge_base_returns_sha(monkeypatch):
663+
monkeypatch.setattr(subprocess, "run",
664+
lambda *a, **kw: _make_proc(0, "abc123def456\n"))
665+
assert bsv.get_merge_base("refs/remotes/origin/acryl-main") == "abc123def456"
666+
667+
668+
def test_get_merge_base_failure_exits(monkeypatch):
669+
monkeypatch.setattr(subprocess, "run",
670+
lambda *a, **kw: _make_proc(1, stderr="not a git repo"))
671+
with pytest.raises(SystemExit):
672+
bsv.get_merge_base("refs/remotes/origin/acryl-main")
673+
674+
675+
def test_get_base_pdl_changes_returns_files(monkeypatch):
676+
monkeypatch.setattr(subprocess, "run",
677+
lambda *a, **kw: _make_proc(0, "a/Foo.pdl\nb/Bar.pdl\n"))
678+
result = bsv.get_base_pdl_changes("deadbeef", "refs/remotes/origin/acryl-main")
679+
assert result == ["a/Foo.pdl", "b/Bar.pdl"]
680+
681+
682+
def test_get_base_pdl_changes_failure_exits(monkeypatch):
683+
monkeypatch.setattr(subprocess, "run",
684+
lambda *a, **kw: (_ for _ in ()).throw(
685+
subprocess.CalledProcessError(1, "git", stderr="err")))
686+
with pytest.raises(SystemExit):
687+
bsv.get_base_pdl_changes("deadbeef", "refs/remotes/origin/acryl-main")
688+
689+
661690
# ---------------------------------------------------------------------------
662691
# Version bump scenarios (via main())
663692
#
@@ -668,17 +697,23 @@ def test_detect_default_branch_fallback_to_master(monkeypatch):
668697
# ---------------------------------------------------------------------------
669698

670699

671-
def _run_main(tmp_path, monkeypatch, changed_files, base_content_by_path):
700+
def _run_main(tmp_path, monkeypatch, changed_files, base_content_by_path, *,
701+
base_pdl_changes=None):
672702
"""
673703
Run main() with filesystem and git calls mocked out.
674704
675-
changed_files — list of Path objects treated as "changed"
705+
changed_files — list of Path objects treated as "changed" on this branch
676706
base_content_by_path — dict mapping str(path) → content string on base branch,
677707
or None to simulate a new file not present on base
708+
base_pdl_changes — list of str paths changed on the base branch since
709+
divergence (default: empty — no conflict)
678710
"""
679711
monkeypatch.setenv("PDL_ROOTS", str(tmp_path))
680712
monkeypatch.setattr(sys, "argv", ["bump_schema_versions.py", "--base-branch", "master"])
681-
monkeypatch.setattr(bsv, "get_changed_pdl_files", lambda _branch: [str(f) for f in changed_files])
713+
monkeypatch.setattr(bsv, "get_merge_base", lambda _ref: "deadbeef")
714+
monkeypatch.setattr(bsv, "get_base_pdl_changes",
715+
lambda _base, _ref: base_pdl_changes or [])
716+
monkeypatch.setattr(bsv, "get_changed_pdl_files", lambda _ref: [str(f) for f in changed_files])
682717
monkeypatch.setattr(bsv, "get_file_at_branch",
683718
lambda path, _branch: base_content_by_path.get(path))
684719
return bsv.main()
@@ -730,14 +765,48 @@ def test_version_bump_new_file_stays_at_1(tmp_path, monkeypatch):
730765
assert bsv.get_schema_version(aspect.read_text()) == 1
731766

732767

768+
def test_conflicting_pdl_on_base_branch_exits_with_error(tmp_path, monkeypatch):
769+
# Base ref is current AND the same PDL changed on both branches → block at stage 2
770+
base_content = aspect_pdl("myAspect")
771+
aspect = write_pdl(tmp_path, "com/linkedin/dataset/MyAspect.pdl", base_content)
772+
773+
rc = _run_main(
774+
tmp_path, monkeypatch,
775+
changed_files=[aspect],
776+
base_content_by_path={str(aspect): base_content},
777+
base_pdl_changes=[str(aspect)], # same file changed on base
778+
)
779+
780+
assert rc == 1
781+
assert bsv.get_schema_version(aspect.read_text()) == 1 # untouched
782+
783+
784+
def test_unrelated_base_pdl_changes_do_not_block(tmp_path, monkeypatch):
785+
# A different PDL changed on the base branch — should not block this branch's bump
786+
base_content = aspect_pdl("myAspect")
787+
aspect = write_pdl(tmp_path, "com/linkedin/dataset/MyAspect.pdl", base_content)
788+
789+
rc = _run_main(
790+
tmp_path, monkeypatch,
791+
changed_files=[aspect],
792+
base_content_by_path={str(aspect): base_content},
793+
base_pdl_changes=["com/linkedin/other/Unrelated.pdl"], # different file on base
794+
)
795+
796+
assert rc == 0
797+
assert bsv.get_schema_version(aspect.read_text()) == 2 # bumped normally
798+
799+
733800
def test_version_bump_dry_run_does_not_write(tmp_path, monkeypatch):
734801
base_content = aspect_pdl("myAspect")
735802
aspect = write_pdl(tmp_path, "com/linkedin/dataset/MyAspect.pdl", base_content)
736803
original = aspect.read_text()
737804

738805
monkeypatch.setenv("PDL_ROOTS", str(tmp_path))
739806
monkeypatch.setattr(sys, "argv", ["bump_schema_versions.py", "--base-branch", "master", "--dry-run"])
740-
monkeypatch.setattr(bsv, "get_changed_pdl_files", lambda _branch: [str(aspect)])
807+
monkeypatch.setattr(bsv, "get_merge_base", lambda _ref: "deadbeef")
808+
monkeypatch.setattr(bsv, "get_base_pdl_changes", lambda _base, _ref: [])
809+
monkeypatch.setattr(bsv, "get_changed_pdl_files", lambda _ref: [str(aspect)])
741810
monkeypatch.setattr(bsv, "get_file_at_branch", lambda path, _branch: base_content)
742811

743812
rc = bsv.main()

0 commit comments

Comments
 (0)