diff --git a/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/env_descr.py b/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/env_descr.py index 332eac9..a1cff6e 100644 --- a/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/env_descr.py +++ b/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/env_descr.py @@ -38,6 +38,7 @@ _ALL_PYPI_FORMATS, FAKEURL_PATHCOMPONENT, AliasType, + _safe_netloc, arch_id, channel_from_url, correct_splitext, @@ -180,11 +181,13 @@ def make_partial_cache_url(cls, base_url: str, is_real_url: bool = True): cls._ensure_class_per_type() url = urlparse(base_url) - if is_real_url or url.netloc.split("/")[0] == FAKEURL_PATHCOMPONENT: + safe_netloc = _safe_netloc(url) + + if is_real_url or safe_netloc.split("/")[0] == FAKEURL_PATHCOMPONENT: return os.path.join( cast(str, CONDA_PACKAGES_DIRNAME), cls.TYPE, - url.netloc, + safe_netloc, url.path.lstrip("/"), ) else: @@ -192,7 +195,7 @@ def make_partial_cache_url(cls, base_url: str, is_real_url: bool = True): cast(str, CONDA_PACKAGES_DIRNAME), cls.TYPE, FAKEURL_PATHCOMPONENT, - url.netloc, + safe_netloc, url.path.lstrip("/"), ) @@ -365,11 +368,12 @@ def __init__( # If it is not a real URL, add the FAKEURL_PATHCOMPONENT but only if not # already there. url_parse_result = urlparse(self._url) - if not url_parse_result.netloc.startswith(FAKEURL_PATHCOMPONENT): + safe_netloc = _safe_netloc(url_parse_result) + if not safe_netloc.startswith(FAKEURL_PATHCOMPONENT): self._url = urlunparse( ( url_parse_result.scheme, - os.path.join(FAKEURL_PATHCOMPONENT, url_parse_result.netloc), + os.path.join(FAKEURL_PATHCOMPONENT, safe_netloc), url_parse_result.path, url_parse_result.params, url_parse_result.query, @@ -502,9 +506,10 @@ def is_fetched(self, pkg_format: str) -> bool: def is_downloadable_url(self, pkg_format: Optional[str] = None) -> bool: if not pkg_format: pkg_format = self._url_format - return pkg_format == self._url_format and not urlparse( - self._url - ).netloc.startswith(FAKEURL_PATHCOMPONENT) + url_parsed = urlparse(self._url) + return pkg_format == self._url_format and not _safe_netloc( + url_parsed + ).startswith(FAKEURL_PATHCOMPONENT) def is_derived(self) -> bool: # If the filename component of the URL does not match the filename of this package, @@ -776,7 +781,9 @@ def is_external_url(self, sources: Dict[str, List[str]]) -> bool: # This is used to determine if we should consider a package as a local package to # be downloaded before resolving the environment pypi_sources = sources.get("pypi", []) - return not any(urlparse(source).netloc in self.url for source in pypi_sources) + return not any( + (urlparse(source).hostname or "") in self.url for source in pypi_sources + ) class ResolvedEnvironment: diff --git a/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/resolvers/conda_lock_resolver.py b/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/resolvers/conda_lock_resolver.py index f3b87dd..57e1f1f 100644 --- a/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/resolvers/conda_lock_resolver.py +++ b/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/resolvers/conda_lock_resolver.py @@ -279,6 +279,7 @@ def resolve( outfile_name, "-k", "explicit", + "--strip-auth", "--conda", ] # if "micromamba_server" in self._conda._bins: @@ -358,7 +359,11 @@ def resolve( base_build_url = components[4] parse = urlparse(base_build_url) clean_path, clean_commit = parse.path.split("@") - clean_url = parse.scheme[4:] + parse.netloc + clean_path + # Use hostname (not netloc) to avoid leaking embedded credentials + safe_netloc = parse.hostname or "" + if parse.port: + safe_netloc += ":%d" % parse.port + clean_url = parse.scheme[4:] + safe_netloc + clean_path base_pkg_url = "%s/%s" % (clean_url, clean_commit) # TODO: Do we need to handle subdirectories cache_base_url = ( diff --git a/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/utils.py b/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/utils.py index 8b99364..bdcdbee 100644 --- a/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/utils.py +++ b/metaflow-netflixext/metaflow_extensions/nflx/plugins/conda/utils.py @@ -27,7 +27,7 @@ Union, cast, ) -from urllib.parse import unquote, urlparse +from urllib.parse import unquote, urlparse, urlunparse import metaflow.metaflow_config as mf_config from metaflow._vendor.packaging.markers import Marker, default_environment @@ -754,6 +754,34 @@ def channel_or_url(url: str) -> str: return url +def strip_url_credentials(url: str) -> str: + """Strip username:password from a URL, preserving everything else.""" + parsed = urlparse(url) + if not parsed.username: + return url + netloc = parsed.hostname or "" + if parsed.port: + netloc += ":%d" % parsed.port + return urlunparse( + ( + parsed.scheme, + netloc, + parsed.path, + parsed.params, + parsed.query, + parsed.fragment, + ) + ) + + +def _safe_netloc(url_parse_result) -> str: + """Return netloc without credentials (hostname:port only).""" + netloc = url_parse_result.hostname or "" + if url_parse_result.port: + netloc += ":%d" % url_parse_result.port + return netloc + + def auth_from_urls(urls: List[str]) -> Optional[AuthBase]: auths_per_hostname = { cast(str, h): HTTPBasicAuth(username, pwd) diff --git a/tests/plugins/conda/test_strip_url_credentials.py b/tests/plugins/conda/test_strip_url_credentials.py new file mode 100644 index 0000000..f600525 --- /dev/null +++ b/tests/plugins/conda/test_strip_url_credentials.py @@ -0,0 +1,80 @@ +"""Tests for strip_url_credentials and safe_netloc helpers in utils.py.""" + +import sys +import os + +sys.path.insert( + 0, + os.path.join( + os.path.dirname(__file__), + "..", + "..", + "..", + "metaflow-netflixext", + ), +) + +from urllib.parse import urlparse +from metaflow_extensions.nflx.plugins.conda.utils import ( + strip_url_credentials, + _safe_netloc, +) + + +class TestStripUrlCredentials: + def test_no_credentials(self): + url = "https://pypi.org/simple/" + assert strip_url_credentials(url) == url + + def test_user_and_password(self): + url = "https://user:token@private-pypi.example.com/simple/" + assert strip_url_credentials(url) == "https://private-pypi.example.com/simple/" + + def test_user_only(self): + url = "https://user@private-pypi.example.com/simple/" + assert strip_url_credentials(url) == "https://private-pypi.example.com/simple/" + + def test_preserves_port(self): + url = "https://user:token@private-pypi.example.com:8080/simple/" + assert ( + strip_url_credentials(url) + == "https://private-pypi.example.com:8080/simple/" + ) + + def test_preserves_query_and_fragment(self): + url = "https://user:pass@host.com/path?q=1#frag" + assert strip_url_credentials(url) == "https://host.com/path?q=1#frag" + + def test_file_url_unchanged(self): + url = "file:///tmp/packages/foo-1.0.tar.gz" + assert strip_url_credentials(url) == url + + def test_empty_string(self): + assert strip_url_credentials("") == "" + + def test_git_plus_https(self): + url = "git+https://token:x-oauth@github.com/org/repo.git@abc123" + result = strip_url_credentials(url) + assert "token" not in result + assert "x-oauth" not in result + assert "github.com" in result + + +class TestSafeNetloc: + def test_simple_host(self): + parsed = urlparse("https://pypi.org/simple/") + assert _safe_netloc(parsed) == "pypi.org" + + def test_host_with_port(self): + parsed = urlparse("https://pypi.org:8080/simple/") + assert _safe_netloc(parsed) == "pypi.org:8080" + + def test_credentials_stripped(self): + parsed = urlparse("https://user:token@private.com/simple/") + assert _safe_netloc(parsed) == "private.com" + assert "user" not in _safe_netloc(parsed) + assert "token" not in _safe_netloc(parsed) + + def test_credentials_with_port_stripped(self): + parsed = urlparse("https://user:token@private.com:443/simple/") + assert _safe_netloc(parsed) == "private.com:443"