Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 98 additions & 12 deletions readthedocs/filetreediff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import json

import structlog

from readthedocs.builds.models import Build
from readthedocs.builds.models import Version
from readthedocs.filetreediff.dataclasses import FileTreeDiff
Expand All @@ -27,7 +29,10 @@
from readthedocs.storage import build_media_storage


log = structlog.get_logger(__name__)

MANIFEST_FILE_NAME = "manifest.json"
BASE_MANIFEST_SNAPSHOT_FILE_NAME = "base_manifest_snapshot.json"


def get_diff(current_version: Version, base_version: Version) -> FileTreeDiff | None:
Expand All @@ -43,24 +48,43 @@ def get_diff(current_version: Version, base_version: Version) -> FileTreeDiff |
To get the modified files, we compare the main content hash of each common file.
If there are no changes between the versions, all lists will be empty.
"""
outdated = False
manifests: list[FileTreeDiffManifest] = []
for version in (current_version, base_version):
manifest = get_manifest(version)
if not manifest:
current_version_manifest = get_manifest(current_version)
if not current_version_manifest:
return None

current_latest_build = current_version.latest_successful_build
if not current_latest_build:
return None

outdated = current_latest_build.id != current_version_manifest.build.id

# For external versions (PRs), prefer the snapshotted base manifest
# (pinned at first PR build) over the live base manifest. This prevents
# false positives when the base branch has moved forward since the PR was
# created. Falls back to the live manifest if no snapshot exists.
# TODO: call clear_base_manifest_snapshot() on PR rebase/synchronize
# events so the snapshot is refreshed against the new base.
#
# When a snapshot is used, we intentionally skip the base-side `outdated`
# check. The snapshot's build will be older than the base version's latest
# build (the base branch kept moving) — that's the whole point. Marking
# the diff as outdated here would defeat the snapshot's purpose.
base_version_manifest = None
if current_version.is_external:
base_version_manifest = _get_base_manifest_snapshot(current_version)

if not base_version_manifest:
base_version_manifest = get_manifest(base_version)
if not base_version_manifest:
return None

latest_build = version.latest_successful_build
if not latest_build:
base_latest_build = base_version.latest_successful_build
if not base_latest_build:
return None

if latest_build.id != manifest.build.id:
if base_latest_build.id != base_version_manifest.build.id:
outdated = True

manifests.append(manifest)

# pylint: disable=unbalanced-tuple-unpacking
current_version_manifest, base_version_manifest = manifests
current_version_file_paths = set(current_version_manifest.files.keys())
base_version_file_paths = set(base_version_manifest.files.keys())

Expand Down Expand Up @@ -116,3 +140,65 @@ def write_manifest(version: Version, manifest: FileTreeDiffManifest):
)
with build_media_storage.open(manifest_path, "w") as f:
json.dump(manifest.as_dict(), f)


def _get_base_manifest_snapshot(
external_version: Version,
) -> FileTreeDiffManifest | None:
"""Get the snapshotted base manifest for an external version, or None."""
snapshot_path = external_version.get_storage_path(
media_type=MEDIA_TYPE_DIFF,
filename=BASE_MANIFEST_SNAPSHOT_FILE_NAME,
)
try:
with build_media_storage.open(snapshot_path) as f:
data = json.load(f)
except FileNotFoundError:
return None

return FileTreeDiffManifest.from_dict(data)


def snapshot_base_manifest(external_version: Version, base_version: Version):
"""
Snapshot the base version's current manifest for a PR version.

Only writes if no snapshot exists yet (first build of the PR).

This stores a full copy of the base manifest per PR. If many PRs branch
from the same base commit, the same content is duplicated. This is a
deliberate tradeoff: manifests are small (a few KB–hundreds of KB), and
the full-copy approach means cleanup is free (deleting the PR's storage
directory removes the snapshot too — no reference counting needed).

A future optimization could store manifests keyed by commit hash
(``manifests/{commit}.json``) and have the snapshot be a pointer. This
would dedup storage but requires a GC task to clean up unreferenced
manifests. Same approach could extend to all build artifacts if we move
toward content-addressed storage.
"""
snapshot_path = external_version.get_storage_path(
media_type=MEDIA_TYPE_DIFF,
filename=BASE_MANIFEST_SNAPSHOT_FILE_NAME,
)
if build_media_storage.exists(snapshot_path):
return

base_manifest = get_manifest(base_version)
if not base_manifest:
return

with build_media_storage.open(snapshot_path, "w") as f:
json.dump(base_manifest.as_dict(), f)

log.info(
"Base manifest snapshot created.",
project_slug=external_version.project.slug,
external_version_slug=external_version.slug,
base_version_slug=base_version.slug,
base_build_id=base_manifest.build.id,
)
# TODO: add a clear_base_manifest_snapshot() helper and call it on PR
# rebase/synchronize webhook events so the snapshot refreshes when the
# PR is rebased against a newer base.
# See https://github.com/readthedocs/readthedocs.org/issues/12232
152 changes: 122 additions & 30 deletions readthedocs/filetreediff/tests/test_filetreediff.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,37 @@
from django.test import TestCase
from django_dynamic_fixture import get

from readthedocs.builds.constants import BUILD_STATE_FINISHED, LATEST
from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL, LATEST
from readthedocs.builds.models import Build, Version
from readthedocs.filetreediff import get_diff
from readthedocs.filetreediff import get_diff, snapshot_base_manifest
from readthedocs.projects.models import Project
from readthedocs.rtd_tests.storage import BuildMediaFileSystemStorageTest


def _mock_open(content):
@contextmanager
def f(*args, **kwargs):
read_mock = mock.MagicMock()
read_mock.read.return_value = content
yield read_mock

return f


def _mock_manifest(build_id: int, files: dict[str, str]):
return _mock_open(
json.dumps(
{
"build": {"id": build_id},
"files": {
path: {"main_content_hash": content_hash}
for path, content_hash in files.items()
},
}
)
)


# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE,
# since the setting is evaluated just once (first test to use the storage
# backend will set it for the whole test suite).
Expand Down Expand Up @@ -59,37 +83,15 @@ def setUp(self):
success=True,
)

def _mock_open(self, content):
@contextmanager
def f(*args, **kwargs):
read_mock = mock.MagicMock()
read_mock.read.return_value = content
yield read_mock

return f

def _mock_manifest(self, build_id: int, files: dict[str, str]):
return self._mock_open(
json.dumps(
{
"build": {"id": build_id},
"files": {
file_path: {"main_content_hash": main_content_hash}
for file_path, main_content_hash in files.items()
},
}
)
)

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_diff_no_changes(self, storage_open):
files_a = {
"index.html": "hash1",
"tutorials/index.html": "hash2",
}
storage_open.side_effect = [
self._mock_manifest(self.build_a.id, files_a)(),
self._mock_manifest(self.build_b.id, files_a)(),
_mock_manifest(self.build_a.id, files_a)(),
_mock_manifest(self.build_b.id, files_a)(),
]
diff = get_diff(self.version_a, self.version_b)
assert diff.added == []
Expand All @@ -110,8 +112,8 @@ def test_diff_changes(self, storage_open):
"deleted.html": "hash-deleted",
}
storage_open.side_effect = [
self._mock_manifest(self.build_a.id, files_a)(),
self._mock_manifest(self.build_b.id, files_b)(),
_mock_manifest(self.build_a.id, files_a)(),
_mock_manifest(self.build_b.id, files_b)(),
]
diff = get_diff(self.version_a, self.version_b)
assert [file.path for file in diff.files] == ["deleted.html", "new-file.html", "tutorials/index.html"]
Expand Down Expand Up @@ -139,12 +141,102 @@ def test_outdated_diff(self, storage_open):
"deleted.html": "hash-deleted",
}
storage_open.side_effect = [
self._mock_manifest(self.build_a_old.id, files_a)(),
self._mock_manifest(self.build_b_old.id, files_b)(),
_mock_manifest(self.build_a_old.id, files_a)(),
_mock_manifest(self.build_b_old.id, files_b)(),
]
diff = get_diff(self.version_a, self.version_b)
assert [file.path for file in diff.files] == ["deleted.html", "new-file.html", "tutorials/index.html"]
assert [file.path for file in diff.added] == ["new-file.html"]
assert [file.path for file in diff.deleted] == ["deleted.html"]
assert [file.path for file in diff.modified] == ["tutorials/index.html"]
assert diff.outdated


@mock.patch(
"readthedocs.filetreediff.build_media_storage",
new=BuildMediaFileSystemStorageTest(),
)
class TestsBaseManifestSnapshot(TestCase):
"""Tests for base manifest snapshotting (stale branch fix)."""

def setUp(self):
self.project = get(Project)
self.base_version = self.project.versions.get(slug=LATEST)
self.base_build = get(
Build,
project=self.project,
version=self.base_version,
state=BUILD_STATE_FINISHED,
success=True,
)
self.pr_version = get(
Version,
project=self.project,
slug="pr-42",
verbose_name="42",
type=EXTERNAL,
active=True,
built=True,
)
self.pr_build = get(
Build,
project=self.project,
version=self.pr_version,
state=BUILD_STATE_FINISHED,
success=True,
)

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_snapshot_used_over_live_base(self, storage_open):
"""For PRs, get_diff uses the snapshot instead of the live base manifest."""
pr_files = {"index.html": "pr-hash", "new-page.html": "new-hash"}
snapshot_files = {"index.html": "original-hash"}

# get_diff reads: 1) PR manifest, 2) base snapshot
storage_open.side_effect = [
_mock_manifest(self.pr_build.id, pr_files)(),
_mock_manifest(self.base_build.id, snapshot_files)(),
]
diff = get_diff(self.pr_version, self.base_version)
assert [f.path for f in diff.added] == ["new-page.html"]
assert [f.path for f in diff.modified] == ["index.html"]
assert diff.deleted == []

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_fallback_to_live_base_when_no_snapshot(self, storage_open):
pr_files = {"index.html": "pr-hash"}
live_base_files = {"index.html": "live-hash"}

# 1) PR manifest, 2) snapshot miss, 3) live base manifest
storage_open.side_effect = [
_mock_manifest(self.pr_build.id, pr_files)(),
FileNotFoundError,
_mock_manifest(self.base_build.id, live_base_files)(),
]
diff = get_diff(self.pr_version, self.base_version)
assert [f.path for f in diff.modified] == ["index.html"]

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_snapshot_prevents_false_positives_from_stale_base(self, storage_open):
"""
PR only changed index.html. Meanwhile base added extra.html.
Without snapshot the diff would show extra.html as deleted — bogus.
"""
pr_files = {"index.html": "pr-hash", "about.html": "same-hash"}
snapshot_files = {"index.html": "original-hash", "about.html": "same-hash"}

storage_open.side_effect = [
_mock_manifest(self.pr_build.id, pr_files)(),
_mock_manifest(self.base_build.id, snapshot_files)(),
]
diff = get_diff(self.pr_version, self.base_version)
assert [f.path for f in diff.modified] == ["index.html"]
assert diff.added == []
assert diff.deleted == []

@mock.patch.object(BuildMediaFileSystemStorageTest, "exists", return_value=True)
@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_snapshot_is_write_once(self, storage_open, storage_exists):
"""snapshot_base_manifest is a no-op if a snapshot already exists."""
snapshot_base_manifest(self.pr_version, self.base_version)
storage_open.assert_not_called()
20 changes: 18 additions & 2 deletions readthedocs/projects/tasks/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from readthedocs.builds.models import Build
from readthedocs.builds.models import Version
from readthedocs.builds.tasks import post_build_overview
from readthedocs.filetreediff import snapshot_base_manifest
from readthedocs.filetreediff import write_manifest
from readthedocs.filetreediff.dataclasses import FileTreeDiffManifest
from readthedocs.filetreediff.dataclasses import FileTreeDiffManifestFile
Expand Down Expand Up @@ -143,11 +144,26 @@ def collect(self, sync_id: int):
manifest = FileTreeDiffManifest(
build_id=self.build.id,
files=[
FileTreeDiffManifestFile(path=path, main_content_hash=hash)
for path, hash in self._hashes.items()
FileTreeDiffManifestFile(path=path, main_content_hash=content_hash)
for path, content_hash in self._hashes.items()
],
)
write_manifest(self.version, manifest)

# For PR previews, snapshot the base version's manifest on the first
# PR build where the snapshot can be created.
# This pins the diff baseline so that subsequent builds compare against
# that snapshotted base-version state instead of the base version's
# current state. This prevents false file changes when the base branch
# moves forward (the "stale branch" problem).
if self.version.is_external:
base_version = (
self.version.project.addons.options_base_version
or self.version.project.get_latest_version()
)
if base_version:
snapshot_base_manifest(self.version, base_version)

if (
self.post_build_overview
and self.version.is_external
Expand Down
Loading