diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index b442bd17b29..80aa2e451ff 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -18,6 +18,8 @@ import json +import structlog + from readthedocs.builds.models import Build from readthedocs.builds.models import Version from readthedocs.filetreediff.dataclasses import FileTreeDiff @@ -27,7 +29,10 @@ from readthedocs.storage import build_media_storage +log = structlog.get_logger(__name__) + MANIFEST_FILE_NAME = "manifest.json" +BASE_MANIFEST_SNAPSHOT_FILE_NAME = "base_manifest_snapshot.json" def get_diff(current_version: Version, base_version: Version) -> FileTreeDiff | None: @@ -43,24 +48,43 @@ def get_diff(current_version: Version, base_version: Version) -> FileTreeDiff | To get the modified files, we compare the main content hash of each common file. If there are no changes between the versions, all lists will be empty. """ - outdated = False - manifests: list[FileTreeDiffManifest] = [] - for version in (current_version, base_version): - manifest = get_manifest(version) - if not manifest: + current_version_manifest = get_manifest(current_version) + if not current_version_manifest: + return None + + current_latest_build = current_version.latest_successful_build + if not current_latest_build: + return None + + outdated = current_latest_build.id != current_version_manifest.build.id + + # For external versions (PRs), prefer the snapshotted base manifest + # (pinned at first PR build) over the live base manifest. This prevents + # false positives when the base branch has moved forward since the PR was + # created. Falls back to the live manifest if no snapshot exists. + # TODO: call clear_base_manifest_snapshot() on PR rebase/synchronize + # events so the snapshot is refreshed against the new base. + # + # When a snapshot is used, we intentionally skip the base-side `outdated` + # check. The snapshot's build will be older than the base version's latest + # build (the base branch kept moving) — that's the whole point. Marking + # the diff as outdated here would defeat the snapshot's purpose. + base_version_manifest = None + if current_version.is_external: + base_version_manifest = _get_base_manifest_snapshot(current_version) + + if not base_version_manifest: + base_version_manifest = get_manifest(base_version) + if not base_version_manifest: return None - latest_build = version.latest_successful_build - if not latest_build: + base_latest_build = base_version.latest_successful_build + if not base_latest_build: return None - if latest_build.id != manifest.build.id: + if base_latest_build.id != base_version_manifest.build.id: outdated = True - manifests.append(manifest) - - # pylint: disable=unbalanced-tuple-unpacking - current_version_manifest, base_version_manifest = manifests current_version_file_paths = set(current_version_manifest.files.keys()) base_version_file_paths = set(base_version_manifest.files.keys()) @@ -116,3 +140,65 @@ def write_manifest(version: Version, manifest: FileTreeDiffManifest): ) with build_media_storage.open(manifest_path, "w") as f: json.dump(manifest.as_dict(), f) + + +def _get_base_manifest_snapshot( + external_version: Version, +) -> FileTreeDiffManifest | None: + """Get the snapshotted base manifest for an external version, or None.""" + snapshot_path = external_version.get_storage_path( + media_type=MEDIA_TYPE_DIFF, + filename=BASE_MANIFEST_SNAPSHOT_FILE_NAME, + ) + try: + with build_media_storage.open(snapshot_path) as f: + data = json.load(f) + except FileNotFoundError: + return None + + return FileTreeDiffManifest.from_dict(data) + + +def snapshot_base_manifest(external_version: Version, base_version: Version): + """ + Snapshot the base version's current manifest for a PR version. + + Only writes if no snapshot exists yet (first build of the PR). + + This stores a full copy of the base manifest per PR. If many PRs branch + from the same base commit, the same content is duplicated. This is a + deliberate tradeoff: manifests are small (a few KB–hundreds of KB), and + the full-copy approach means cleanup is free (deleting the PR's storage + directory removes the snapshot too — no reference counting needed). + + A future optimization could store manifests keyed by commit hash + (``manifests/{commit}.json``) and have the snapshot be a pointer. This + would dedup storage but requires a GC task to clean up unreferenced + manifests. Same approach could extend to all build artifacts if we move + toward content-addressed storage. + """ + snapshot_path = external_version.get_storage_path( + media_type=MEDIA_TYPE_DIFF, + filename=BASE_MANIFEST_SNAPSHOT_FILE_NAME, + ) + if build_media_storage.exists(snapshot_path): + return + + base_manifest = get_manifest(base_version) + if not base_manifest: + return + + with build_media_storage.open(snapshot_path, "w") as f: + json.dump(base_manifest.as_dict(), f) + + log.info( + "Base manifest snapshot created.", + project_slug=external_version.project.slug, + external_version_slug=external_version.slug, + base_version_slug=base_version.slug, + base_build_id=base_manifest.build.id, + ) + # TODO: add a clear_base_manifest_snapshot() helper and call it on PR + # rebase/synchronize webhook events so the snapshot refreshes when the + # PR is rebased against a newer base. + # See https://github.com/readthedocs/readthedocs.org/issues/12232 diff --git a/readthedocs/filetreediff/tests/test_filetreediff.py b/readthedocs/filetreediff/tests/test_filetreediff.py index 8d19d3ee2e3..449d3911881 100644 --- a/readthedocs/filetreediff/tests/test_filetreediff.py +++ b/readthedocs/filetreediff/tests/test_filetreediff.py @@ -5,13 +5,37 @@ from django.test import TestCase from django_dynamic_fixture import get -from readthedocs.builds.constants import BUILD_STATE_FINISHED, LATEST +from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL, LATEST from readthedocs.builds.models import Build, Version -from readthedocs.filetreediff import get_diff +from readthedocs.filetreediff import get_diff, snapshot_base_manifest from readthedocs.projects.models import Project from readthedocs.rtd_tests.storage import BuildMediaFileSystemStorageTest +def _mock_open(content): + @contextmanager + def f(*args, **kwargs): + read_mock = mock.MagicMock() + read_mock.read.return_value = content + yield read_mock + + return f + + +def _mock_manifest(build_id: int, files: dict[str, str]): + return _mock_open( + json.dumps( + { + "build": {"id": build_id}, + "files": { + path: {"main_content_hash": content_hash} + for path, content_hash in files.items() + }, + } + ) + ) + + # We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE, # since the setting is evaluated just once (first test to use the storage # backend will set it for the whole test suite). @@ -59,28 +83,6 @@ def setUp(self): success=True, ) - def _mock_open(self, content): - @contextmanager - def f(*args, **kwargs): - read_mock = mock.MagicMock() - read_mock.read.return_value = content - yield read_mock - - return f - - def _mock_manifest(self, build_id: int, files: dict[str, str]): - return self._mock_open( - json.dumps( - { - "build": {"id": build_id}, - "files": { - file_path: {"main_content_hash": main_content_hash} - for file_path, main_content_hash in files.items() - }, - } - ) - ) - @mock.patch.object(BuildMediaFileSystemStorageTest, "open") def test_diff_no_changes(self, storage_open): files_a = { @@ -88,8 +90,8 @@ def test_diff_no_changes(self, storage_open): "tutorials/index.html": "hash2", } storage_open.side_effect = [ - self._mock_manifest(self.build_a.id, files_a)(), - self._mock_manifest(self.build_b.id, files_a)(), + _mock_manifest(self.build_a.id, files_a)(), + _mock_manifest(self.build_b.id, files_a)(), ] diff = get_diff(self.version_a, self.version_b) assert diff.added == [] @@ -110,8 +112,8 @@ def test_diff_changes(self, storage_open): "deleted.html": "hash-deleted", } storage_open.side_effect = [ - self._mock_manifest(self.build_a.id, files_a)(), - self._mock_manifest(self.build_b.id, files_b)(), + _mock_manifest(self.build_a.id, files_a)(), + _mock_manifest(self.build_b.id, files_b)(), ] diff = get_diff(self.version_a, self.version_b) assert [file.path for file in diff.files] == ["deleted.html", "new-file.html", "tutorials/index.html"] @@ -139,8 +141,8 @@ def test_outdated_diff(self, storage_open): "deleted.html": "hash-deleted", } storage_open.side_effect = [ - self._mock_manifest(self.build_a_old.id, files_a)(), - self._mock_manifest(self.build_b_old.id, files_b)(), + _mock_manifest(self.build_a_old.id, files_a)(), + _mock_manifest(self.build_b_old.id, files_b)(), ] diff = get_diff(self.version_a, self.version_b) assert [file.path for file in diff.files] == ["deleted.html", "new-file.html", "tutorials/index.html"] @@ -148,3 +150,93 @@ def test_outdated_diff(self, storage_open): assert [file.path for file in diff.deleted] == ["deleted.html"] assert [file.path for file in diff.modified] == ["tutorials/index.html"] assert diff.outdated + + +@mock.patch( + "readthedocs.filetreediff.build_media_storage", + new=BuildMediaFileSystemStorageTest(), +) +class TestsBaseManifestSnapshot(TestCase): + """Tests for base manifest snapshotting (stale branch fix).""" + + def setUp(self): + self.project = get(Project) + self.base_version = self.project.versions.get(slug=LATEST) + self.base_build = get( + Build, + project=self.project, + version=self.base_version, + state=BUILD_STATE_FINISHED, + success=True, + ) + self.pr_version = get( + Version, + project=self.project, + slug="pr-42", + verbose_name="42", + type=EXTERNAL, + active=True, + built=True, + ) + self.pr_build = get( + Build, + project=self.project, + version=self.pr_version, + state=BUILD_STATE_FINISHED, + success=True, + ) + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_snapshot_used_over_live_base(self, storage_open): + """For PRs, get_diff uses the snapshot instead of the live base manifest.""" + pr_files = {"index.html": "pr-hash", "new-page.html": "new-hash"} + snapshot_files = {"index.html": "original-hash"} + + # get_diff reads: 1) PR manifest, 2) base snapshot + storage_open.side_effect = [ + _mock_manifest(self.pr_build.id, pr_files)(), + _mock_manifest(self.base_build.id, snapshot_files)(), + ] + diff = get_diff(self.pr_version, self.base_version) + assert [f.path for f in diff.added] == ["new-page.html"] + assert [f.path for f in diff.modified] == ["index.html"] + assert diff.deleted == [] + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_fallback_to_live_base_when_no_snapshot(self, storage_open): + pr_files = {"index.html": "pr-hash"} + live_base_files = {"index.html": "live-hash"} + + # 1) PR manifest, 2) snapshot miss, 3) live base manifest + storage_open.side_effect = [ + _mock_manifest(self.pr_build.id, pr_files)(), + FileNotFoundError, + _mock_manifest(self.base_build.id, live_base_files)(), + ] + diff = get_diff(self.pr_version, self.base_version) + assert [f.path for f in diff.modified] == ["index.html"] + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_snapshot_prevents_false_positives_from_stale_base(self, storage_open): + """ + PR only changed index.html. Meanwhile base added extra.html. + Without snapshot the diff would show extra.html as deleted — bogus. + """ + pr_files = {"index.html": "pr-hash", "about.html": "same-hash"} + snapshot_files = {"index.html": "original-hash", "about.html": "same-hash"} + + storage_open.side_effect = [ + _mock_manifest(self.pr_build.id, pr_files)(), + _mock_manifest(self.base_build.id, snapshot_files)(), + ] + diff = get_diff(self.pr_version, self.base_version) + assert [f.path for f in diff.modified] == ["index.html"] + assert diff.added == [] + assert diff.deleted == [] + + @mock.patch.object(BuildMediaFileSystemStorageTest, "exists", return_value=True) + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_snapshot_is_write_once(self, storage_open, storage_exists): + """snapshot_base_manifest is a no-op if a snapshot already exists.""" + snapshot_base_manifest(self.pr_version, self.base_version) + storage_open.assert_not_called() diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index cb76bb4c127..82fd69ae364 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -9,6 +9,7 @@ from readthedocs.builds.models import Build from readthedocs.builds.models import Version from readthedocs.builds.tasks import post_build_overview +from readthedocs.filetreediff import snapshot_base_manifest from readthedocs.filetreediff import write_manifest from readthedocs.filetreediff.dataclasses import FileTreeDiffManifest from readthedocs.filetreediff.dataclasses import FileTreeDiffManifestFile @@ -143,11 +144,26 @@ def collect(self, sync_id: int): manifest = FileTreeDiffManifest( build_id=self.build.id, files=[ - FileTreeDiffManifestFile(path=path, main_content_hash=hash) - for path, hash in self._hashes.items() + FileTreeDiffManifestFile(path=path, main_content_hash=content_hash) + for path, content_hash in self._hashes.items() ], ) write_manifest(self.version, manifest) + + # For PR previews, snapshot the base version's manifest on the first + # PR build where the snapshot can be created. + # This pins the diff baseline so that subsequent builds compare against + # that snapshotted base-version state instead of the base version's + # current state. This prevents false file changes when the base branch + # moves forward (the "stale branch" problem). + if self.version.is_external: + base_version = ( + self.version.project.addons.options_base_version + or self.version.project.get_latest_version() + ) + if base_version: + snapshot_base_manifest(self.version, base_version) + if ( self.post_build_overview and self.version.is_external