diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index 30bde8f5162..9e5c98ddfcf 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -56,6 +56,7 @@ from readthedocs.notifications.models import Notification from readthedocs.projects.constants import BITBUCKET_COMMIT_URL from readthedocs.projects.constants import DOCTYPE_CHOICES +from readthedocs.projects.constants import DOWNLOADABLE_MEDIA_TYPES from readthedocs.projects.constants import GITHUB_COMMIT_URL from readthedocs.projects.constants import GITHUB_PULL_REQUEST_COMMIT_URL from readthedocs.projects.constants import GITLAB_COMMIT_URL @@ -73,6 +74,7 @@ from readthedocs.projects.ordering import ProjectItemPositionManager from readthedocs.projects.validators import validate_build_config_file from readthedocs.projects.version_handling import determine_stable_version +from readthedocs.storage import build_media_storage log = structlog.get_logger(__name__) @@ -542,20 +544,58 @@ def get_storage_paths(self, version_slug=None): sometimes to clean old resources. :rtype: list """ - paths = [] - - slug = version_slug or self.slug - for type_ in MEDIA_TYPES: - paths.append( - self.project.get_storage_path( - type_=type_, - version_slug=slug, - include_file=False, - version_type=self.type, - ) - ) + return [ + self.get_storage_path(media_type=media_type, version_slug=version_slug) + for media_type in MEDIA_TYPES + ] + + def get_storage_path(self, media_type, filename=None, version_slug=None): + """ + Get a path in storage for a given media type and filename for this version. + + :param media_type: The type of media (e.g. "pdf", "epub", "htmlzip"). + :param filename: Optional filename to append to the path. + If not provided, the directory path for the media type will be returned. + :param version_slug: Override the version slug to use in the path. + This is useful when the version slug has changed but we need to access old resources. + """ + if media_type not in MEDIA_TYPES: + raise ValueError("Invalid type.") + + version_slug = version_slug or self.slug + + path = media_type + if self.is_external: + path = f"{EXTERNAL}/{media_type}" + + # Version slug may come from an untrusted input, + # so we use join to avoid any path traversal. + # All other values are already validated. + path = build_media_storage.join(f"{path}/{self.project.slug}", version_slug) + + # If the filename starts with `/`, the join will fail, + # so we strip it before joining it. + filename = (filename or "").lstrip("/") + if not filename: + return path + + return build_media_storage.join(path, filename) + + def get_download_storage_path(self, media_type): + """ + Get the storage path for a downloadable artifact of this version. + + This is basically a shortcut to `get_storage_path` that also adds the + filename based on the version slug and media type. + + :param media_type: The type of media (e.g. "pdf", "epub", "htmlzip"). + """ + if media_type not in DOWNLOADABLE_MEDIA_TYPES: + raise ValueError("Invalid type for downloadable file.") - return paths + extension = media_type.replace("htmlzip", "zip") + filename = f"{self.project.slug}.{extension}" + return self.get_storage_path(media_type=media_type, filename=filename) class APIVersion(Version): diff --git a/readthedocs/embed/v3/views.py b/readthedocs/embed/v3/views.py index 69c6e95d992..9abc7ce5810 100644 --- a/readthedocs/embed/v3/views.py +++ b/readthedocs/embed/v3/views.py @@ -21,6 +21,7 @@ from readthedocs.api.v3.permissions import HasEmbedAPIAccess from readthedocs.core.utils.extend import SettingsOverrideObject from readthedocs.embed.utils import clean_references +from readthedocs.projects.constants import MEDIA_TYPE_HTML from readthedocs.storage import build_media_storage @@ -92,32 +93,19 @@ def _download_page_content(self, url): ) return response.content - def _get_page_content_from_storage(self, project, version, filename): - storage_path = project.get_storage_path( - "html", - version_slug=version.slug, - include_file=False, - version_type=version.type, - ) - + def _get_page_content_from_storage(self, version, filename): # Decode encoded URLs (e.g. convert %20 into a whitespace) filename = urllib.parse.unquote(filename) - - # If the filename starts with `/`, the join will fail, - # so we strip it before joining it. - relative_filename = filename.lstrip("/") - file_path = build_media_storage.join( - storage_path, - relative_filename, - ) - - tryfiles = [file_path, build_media_storage.join(file_path, "index.html")] + tryfiles = [filename, build_media_storage.join(filename, "index.html")] for tryfile in tryfiles: + storage_file_path = version.get_storage_path( + media_type=MEDIA_TYPE_HTML, filename=tryfile + ) try: - with build_media_storage.open(tryfile) as fd: + with build_media_storage.open(storage_file_path) as fd: return fd.read() except Exception: # noqa - log.warning("Unable to read file.", file_path=file_path) + log.warning("Unable to read file.", file_path=storage_file_path) return None @@ -132,10 +120,9 @@ def _get_content_by_fragment( if self.external: page_content = self._download_page_content(url) else: - project = self.unresolved_url.project version = self.unresolved_url.version filename = self.unresolved_url.filename - page_content = self._get_page_content_from_storage(project, version, filename) + page_content = self._get_page_content_from_storage(version, filename) return self._parse_based_on_doctool( page_content, diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index 64ea96f745d..b442bd17b29 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -96,13 +96,10 @@ def get_manifest(version: Version) -> FileTreeDiffManifest | None: If the manifest file does not exist, return None. """ - storage_path = version.project.get_storage_path( - type_=MEDIA_TYPE_DIFF, - version_slug=version.slug, - include_file=False, - version_type=version.type, + manifest_path = version.get_storage_path( + media_type=MEDIA_TYPE_DIFF, + filename=MANIFEST_FILE_NAME, ) - manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME) try: with build_media_storage.open(manifest_path) as manifest_file: manifest = json.load(manifest_file) @@ -113,12 +110,9 @@ def get_manifest(version: Version) -> FileTreeDiffManifest | None: def write_manifest(version: Version, manifest: FileTreeDiffManifest): - storage_path = version.project.get_storage_path( - type_=MEDIA_TYPE_DIFF, - version_slug=version.slug, - include_file=False, - version_type=version.type, + manifest_path = version.get_storage_path( + media_type=MEDIA_TYPE_DIFF, + filename=MANIFEST_FILE_NAME, ) - manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME) with build_media_storage.open(manifest_path, "w") as f: json.dump(manifest.as_dict(), f) diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 45f4e9d6117..f2f69a835ac 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -60,13 +60,11 @@ from readthedocs.projects.validators import validate_repository_url from readthedocs.projects.version_handling import determine_stable_version from readthedocs.search.parsers import GenericParser -from readthedocs.storage import build_media_storage from readthedocs.vcs_support.backends import backend_cls from .constants import ADDONS_FLYOUT_POSITION_CHOICES from .constants import ADDONS_FLYOUT_SORTING_CHOICES from .constants import ADDONS_FLYOUT_SORTING_SEMVER_READTHEDOCS_COMPATIBLE -from .constants import DOWNLOADABLE_MEDIA_TYPES from .constants import MEDIA_TYPES from .constants import MULTIPLE_VERSIONS_WITH_TRANSLATIONS from .constants import MULTIPLE_VERSIONS_WITHOUT_TRANSLATIONS @@ -784,48 +782,12 @@ def get_storage_paths(self): """ Get the paths of all artifacts used by the project. - :return: the path to an item in storage - (can be used with ``storage.url`` to get the URL). + :return: A list of paths where the project's artifacts are stored. """ storage_paths = [f"{type_}/{self.slug}" for type_ in MEDIA_TYPES] + storage_paths.extend(f"{EXTERNAL}/{type_}/{self.slug}" for type_ in MEDIA_TYPES) return storage_paths - def get_storage_path(self, type_, version_slug=LATEST, include_file=True, version_type=None): - """ - Get a path to a build artifact for use with Django's storage system. - - :param type_: Media content type, ie - 'pdf', 'htmlzip' - :param version_slug: Project version slug for lookup - :param include_file: Include file name in return - :param version_type: Project version type - :return: the path to an item in storage - (can be used with ``storage.url`` to get the URL) - """ - if type_ not in MEDIA_TYPES: - raise ValueError("Invalid content type.") - - if include_file and type_ not in DOWNLOADABLE_MEDIA_TYPES: - raise ValueError("Invalid content type for downloadable file.") - - type_dir = type_ - # Add `external/` prefix for external versions - if version_type == EXTERNAL: - type_dir = f"{EXTERNAL}/{type_}" - - # Version slug may come from an unstrusted input, - # so we use join to avoid any path traversal. - # All other values are already validated. - folder_path = build_media_storage.join(f"{type_dir}/{self.slug}", version_slug) - - if include_file: - extension = type_.replace("htmlzip", "zip") - return "{}/{}.{}".format( - folder_path, - self.slug, - extension, - ) - return folder_path - def get_production_media_url(self, type_, version_slug, resolver=None): """Get the URL for downloading a specific media file.""" # Use project domain for full path --same domain as docs diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index 987481e5d8e..b3bedbdf45d 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -981,13 +981,7 @@ def store_build_artifacts(self): version=self.data.version.slug, type_=media_type, ) - to_path = self.data.project.get_storage_path( - type_=media_type, - version_slug=self.data.version.slug, - include_file=False, - version_type=self.data.version.type, - ) - + to_path = self.data.version.get_storage_path(media_type=media_type) self._log_directory_size(from_path, media_type) try: @@ -1013,12 +1007,7 @@ def store_build_artifacts(self): # Delete formats for media_type in types_to_delete: - media_path = self.data.version.project.get_storage_path( - type_=media_type, - version_slug=self.data.version.slug, - include_file=False, - version_type=self.data.version.type, - ) + media_path = self.data.version.get_storage_path(media_type=media_type) try: build_media_storage.delete_directory(media_path) except Exception as exc: diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 61992cd4703..cb76bb4c127 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -12,6 +12,7 @@ from readthedocs.filetreediff import write_manifest from readthedocs.filetreediff.dataclasses import FileTreeDiffManifest from readthedocs.filetreediff.dataclasses import FileTreeDiffManifestFile +from readthedocs.projects.constants import MEDIA_TYPE_HTML from readthedocs.projects.models import HTMLFile from readthedocs.projects.models import Project from readthedocs.projects.signals import files_changed @@ -214,12 +215,7 @@ def _get_indexers( def _process_files(*, version: Version, indexers: list[Indexer]): - storage_path = version.project.get_storage_path( - type_="html", - version_slug=version.slug, - include_file=False, - version_type=version.type, - ) + storage_path = version.get_storage_path(media_type=MEDIA_TYPE_HTML) # A sync ID is a number different than the current `build` attribute (pending rename), # it's used to differentiate the files from the current sync from the previous one. # This is useful to easily delete the previous files from the DB and ES. diff --git a/readthedocs/projects/views/public.py b/readthedocs/projects/views/public.py index 2a59d296eef..7a7d3f0f029 100644 --- a/readthedocs/projects/views/public.py +++ b/readthedocs/projects/views/public.py @@ -333,12 +333,6 @@ def get( is_external = request.unresolved_domain.is_from_external_domain manager = EXTERNAL if is_external else INTERNAL - # Additional protection to force all storage calls - # to use the external or internal versions storage. - # TODO: We already force the manager to match the type, - # so we could probably just remove this. - self.version_type = manager - # It uses the request to get the ``project``. # The rest of arguments come from the URL. project = unresolved_domain.project diff --git a/readthedocs/proxito/tests/test_full.py b/readthedocs/proxito/tests/test_full.py index a0148dd6379..69ba2d863cc 100644 --- a/readthedocs/proxito/tests/test_full.py +++ b/readthedocs/proxito/tests/test_full.py @@ -629,7 +629,8 @@ def test_invalid_download_files(self): self.assertEqual(resp.status_code, 404) @override_settings(PYTHON_MEDIA=False) - def test_download_files_from_external_version(self): + def test_download_external_version_on_main_domain(self): + """Downloading an external version from the main domain should return 404.""" fixture.get( Version, verbose_name="10", @@ -646,7 +647,8 @@ def test_download_files_from_external_version(self): self.assertEqual(resp.status_code, 404) @override_settings(PYTHON_MEDIA=False) - def test_download_files_from_external_version_from_main_domain(self): + def test_download_internal_version_on_external_domain(self): + """Downloading an internal version from an external domain should return 404.""" fixture.get( Version, verbose_name="10", diff --git a/readthedocs/proxito/views/mixins.py b/readthedocs/proxito/views/mixins.py index 5880bd016d9..c15e8b60480 100644 --- a/readthedocs/proxito/views/mixins.py +++ b/readthedocs/proxito/views/mixins.py @@ -15,7 +15,6 @@ from slugify import slugify as unicode_slugify from readthedocs.audit.models import AuditLog -from readthedocs.builds.constants import INTERNAL from readthedocs.core.resolver import Resolver from readthedocs.projects.constants import MEDIA_TYPE_HTML from readthedocs.proxito.constants import RedirectType @@ -40,10 +39,6 @@ class StorageFileNotFound(Exception): class ServeDocsMixin: """Class implementing all the logic to serve a document.""" - # We force all storage calls to use internal versions - # unless explicitly set to external. - version_type = INTERNAL - def _serve_docs(self, request, project, version, filename, check_if_exists=False): """ Serve a documentation file. @@ -53,14 +48,7 @@ def _serve_docs(self, request, project, version, filename, check_if_exists=False Useful to make sure were are serving a file that exists in storage, checking if the file exists will make one additional request to the storage. """ - base_storage_path = project.get_storage_path( - type_=MEDIA_TYPE_HTML, - version_slug=version.slug, - include_file=False, - # Force to always read from the internal or extrernal storage, - # according to the current request. - version_type=self.version_type, - ) + base_storage_path = version.get_storage_path(media_type=MEDIA_TYPE_HTML) # Handle our backend storage not supporting directory indexes, # so we need to append index.html when appropriate. @@ -100,14 +88,7 @@ def _serve_dowload(self, request, project, version, type_): filename (e.g. "pip-pypa-io-en-latest.pdf" or "pip-pypi-io-en-v2.0.pdf" or "docs-celeryproject-org-kombu-en-stable.pdf"). """ - storage_path = project.get_storage_path( - type_=type_, - version_slug=version.slug, - # Force to always read from the internal or extrernal storage, - # according to the current request. - version_type=self.version_type, - include_file=True, - ) + storage_path = version.get_download_storage_path(media_type=type_) self._track_pageview( project=project, path=storage_path, diff --git a/readthedocs/proxito/views/serve.py b/readthedocs/proxito/views/serve.py index ad111e1125c..54b5bba194e 100644 --- a/readthedocs/proxito/views/serve.py +++ b/readthedocs/proxito/views/serve.py @@ -12,7 +12,6 @@ from django.views import View from readthedocs.api.mixins import CDNCacheTagsMixin -from readthedocs.builds.constants import EXTERNAL from readthedocs.builds.constants import INTERNAL from readthedocs.core.mixins import CDNCacheControlMixin from readthedocs.core.resolver import Resolver @@ -23,6 +22,7 @@ from readthedocs.core.unresolver import VersionNotFoundError from readthedocs.core.unresolver import unresolver from readthedocs.core.utils.extend import SettingsOverrideObject +from readthedocs.projects.constants import MEDIA_TYPE_HTML from readthedocs.projects.constants import OLD_LANGUAGES_CODE_MAPPING from readthedocs.projects.constants import PRIVATE from readthedocs.projects.models import Domain @@ -174,11 +174,6 @@ def _get_canonical_redirect_type(self, request): def serve_path(self, request, path): unresolved_domain = request.unresolved_domain - # We force all storage calls to use the external versions storage, - # since we are serving an external version. - if unresolved_domain.is_from_external_domain: - self.version_type = EXTERNAL - # 404 errors aren't contextualized here because all 404s use the internal nginx redirect, # where the path will be 'unresolved' again when handling the 404 error # See: ServeError404Base @@ -390,14 +385,6 @@ def get(self, request, proxito_path): structlog.contextvars.bind_contextvars(proxito_path=proxito_path) log.debug("Executing 404 handler.") unresolved_domain = request.unresolved_domain - # We force all storage calls to use the external versions storage, - # since we are serving an external version. - # The version that results from the unresolve_path() call already is - # validated to use the correct manager, this is here to add defense in - # depth against serving the wrong version. - if unresolved_domain.is_from_external_domain: - self.version_type = EXTERNAL - project = None version = None # If we weren't able to resolve a filename, @@ -557,13 +544,10 @@ def _get_custom_404_page(self, request, project, version=None): if (version_404.slug, tryfile) not in available_404_files: continue - storage_root_path = project.get_storage_path( - type_="html", - version_slug=version_404.slug, - include_file=False, - version_type=self.version_type, + storage_filename_path = version_404.get_storage_path( + media_type=MEDIA_TYPE_HTML, + filename=tryfile, ) - storage_filename_path = build_media_storage.join(storage_root_path, tryfile) log.debug( "Serving custom 404.html page.", version_slug_404=version_404.slug, diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index c8b552f204d..ca8d42e75fb 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -10,6 +10,7 @@ from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL, LATEST from readthedocs.builds.models import Build, Version from readthedocs.filetreediff.dataclasses import FileTreeDiffManifest, FileTreeDiffManifestFile +from readthedocs.projects.constants import MEDIA_TYPE_HTML from readthedocs.projects.models import HTMLFile, ImportedFile, Project from readthedocs.projects.tasks.search import index_build from readthedocs.search.documents import PageDocument @@ -61,12 +62,7 @@ def _copy_storage_dir(self, version): """Copy the test directory (rtd_tests/files) to storage""" self.storage.rclone_sync_directory( self.test_dir, - self.project.get_storage_path( - type_="html", - version_slug=version.slug, - include_file=False, - version_type=version.type, - ), + version.get_storage_path(media_type=MEDIA_TYPE_HTML), ) def test_properly_created(self): diff --git a/readthedocs/rtd_tests/tests/test_project.py b/readthedocs/rtd_tests/tests/test_project.py index c8b43178f4a..25fd61c191b 100644 --- a/readthedocs/rtd_tests/tests/test_project.py +++ b/readthedocs/rtd_tests/tests/test_project.py @@ -38,6 +38,8 @@ class ProjectMixin: def setUp(self): self.client.login(username="eric", password="test") self.pip = Project.objects.get(slug="pip") + self.pip.save() + self.version = self.pip.versions.get(slug=LATEST) # Create a External Version. ie: pull/merge request Version. self.external_version = get( Version, @@ -105,62 +107,70 @@ def test_multiple_conf_files(self, find_method): ProjectConfigurationError.MULTIPLE_CONF_FILES, ) + def test_get_storage_paths(self): + assert self.pip.get_storage_paths() == [ + "html/pip", + "pdf/pip", + "epub/pip", + "htmlzip/pip", + "json/pip", + "diff/pip", + "external/html/pip", + "external/pdf/pip", + "external/epub/pip", + "external/htmlzip/pip", + "external/json/pip", + "external/diff/pip", + ] + def test_get_storage_path(self): for type_ in MEDIA_TYPES: self.assertEqual( - self.pip.get_storage_path(type_, LATEST, include_file=False), + self.version.get_storage_path(type_), f"{type_}/pip/latest", ) self.assertEqual( - self.pip.get_storage_path(MEDIA_TYPE_PDF, LATEST), + self.version.get_download_storage_path(MEDIA_TYPE_PDF), "pdf/pip/latest/pip.pdf", ) self.assertEqual( - self.pip.get_storage_path(MEDIA_TYPE_EPUB, LATEST), + self.version.get_download_storage_path(MEDIA_TYPE_EPUB), "epub/pip/latest/pip.epub", ) self.assertEqual( - self.pip.get_storage_path(MEDIA_TYPE_HTMLZIP, LATEST), + self.version.get_download_storage_path(MEDIA_TYPE_HTMLZIP), "htmlzip/pip/latest/pip.zip", ) def test_get_storage_path_invalid_inputs(self): # Invalid type. with pytest.raises(ValueError): - self.pip.get_storage_path("foo") + self.version.get_storage_path("foo") # Trying to get a file from a non-downloadable type. with pytest.raises(ValueError): - self.pip.get_storage_path(MEDIA_TYPE_HTML, include_file=True) + self.version.get_download_storage_path(MEDIA_TYPE_HTML) # Trying path traversal. with pytest.raises(ValueError): - self.pip.get_storage_path( - MEDIA_TYPE_HTML, version_slug="../sneaky/index.html", include_file=False - ) + self.version.get_storage_path(MEDIA_TYPE_HTML, version_slug="../sneaky/index.html") def test_get_storage_path_for_external_versions(self): self.assertEqual( - self.pip.get_storage_path( + self.external_version.get_download_storage_path( "pdf", - self.external_version.slug, - version_type=self.external_version.type, ), "external/pdf/pip/99/pip.pdf", ) self.assertEqual( - self.pip.get_storage_path( + self.external_version.get_download_storage_path( "epub", - self.external_version.slug, - version_type=self.external_version.type, ), "external/epub/pip/99/pip.epub", ) self.assertEqual( - self.pip.get_storage_path( + self.external_version.get_download_storage_path( "htmlzip", - self.external_version.slug, - version_type=self.external_version.type, ), "external/htmlzip/pip/99/pip.zip", ) diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index 5d500c5c0a8..c8959dde74f 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -7,6 +7,7 @@ import structlog from selectolax.parser import HTMLParser +from readthedocs.projects.constants import MEDIA_TYPE_HTML from readthedocs.storage import build_media_storage @@ -71,13 +72,7 @@ def _get_page_content(self, page): """Gets the page content from storage.""" content = None try: - storage_path = self.project.get_storage_path( - type_="html", - version_slug=self.version.slug, - include_file=False, - version_type=self.version.type, - ) - file_path = self.storage.join(storage_path, page) + file_path = self.version.get_storage_path(media_type=MEDIA_TYPE_HTML, filename=page) with self.storage.open(file_path, mode="r") as f: content = f.read() except Exception: