diff --git a/scripts/dump-dependencies.py b/scripts/dump-dependencies.py new file mode 100644 index 00000000..d1abee6d --- /dev/null +++ b/scripts/dump-dependencies.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +""" +dump-dependencies.py + +Clones (or scans a local) repo and dumps all direct AND transitive +dependencies from pyproject.toml, package.json, uv.lock, and +package-lock.json to a CSV. + +Output columns: + repo, package_name, dependency_type, dependency, version_spec, min_version + + repo : basename of the repo URL / local path + package_name : name from the manifest (project.name / package.json#name); + repo name for lockfile rows + dependency_type : pyproject: dependency | optional: | build | dev-group: + package.json: dependencies | devDependencies | peerDependencies | optionalDependencies + lockfiles: transitive-pypi | transitive-npm + dependency : normalized dependency name + version_spec : raw constraint from manifest, or pinned version from lockfile + min_version : lowest semver implied by version_spec (best-effort, may be blank) + +Usage: + python scripts/dump-dependencies.py --repo [--out FILE] +""" + +import argparse +import csv +import json +import re +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +try: + import tomllib # Python 3.11+ +except ModuleNotFoundError: + try: + import tomli as tomllib # type: ignore[no-redef] # pip install tomli + except ModuleNotFoundError: + sys.exit("Error: requires Python 3.11+ or `pip install tomli`") + +try: + from min_version import min_version +except ImportError: + from scripts.min_version import min_version # type: ignore[no-redef] + +EXCLUDE_DIRS = {"node_modules", ".venv", ".git", "__pycache__", "dist", "build"} +FIELDNAMES = ["repo", "package_name", "dependency_type", "dependency", "version_spec", "min_version"] +MANIFEST_NAMES = {"pyproject.toml", "package.json", "uv.lock", "package-lock.json"} + + +def normalize_pypi(name: str) -> str: + return 
re.sub(r"[-_.]+", "-", name).lower() + + +def split_pypi_dep(dep: str) -> tuple[str, str]: + """'package>=1.0,<2 ; marker' → (name, spec)""" + dep = dep.split(";")[0].strip() + m = re.match(r"^([A-Za-z0-9_.\-\[\]]+)\s*(.*)$", dep) + if m: + pkg = re.sub(r"\[.*?\]", "", m.group(1)).strip() + return normalize_pypi(pkg), m.group(2).strip() + return normalize_pypi(dep), "" + + +def row(repo: str, package_name: str, dependency_type: str, + dependency: str, version_spec: str) -> dict: + return { + "repo": repo, + "package_name": package_name, + "dependency_type": dependency_type, + "dependency": dependency, + "version_spec": version_spec, + "min_version": min_version(version_spec), + } + + +def parse_pyproject(path: Path, repo: str) -> list[dict]: + with open(path, "rb") as f: + data = tomllib.load(f) + project = data.get("project", {}) + pkg_name = normalize_pypi(project.get("name", repo)) + rows = [] + + for dep in project.get("dependencies", []): + n, s = split_pypi_dep(dep) + rows.append(row(repo, pkg_name, "dependencies", n, s)) + + for group, deps in project.get("optional-dependencies", {}).items(): + for dep in deps: + n, s = split_pypi_dep(dep) + rows.append(row(repo, pkg_name, f"optional:{group}", n, s)) + + for dep in data.get("build-system", {}).get("requires", []): + n, s = split_pypi_dep(dep) + rows.append(row(repo, pkg_name, "build", n, s)) + + for grp, deps in data.get("dependency-groups", {}).items(): + for dep in deps: + if isinstance(dep, str): + n, s = split_pypi_dep(dep) + rows.append(row(repo, pkg_name, f"dev-group:{grp}", n, s)) + + for dep in data.get("tool", {}).get("uv", {}).get("dev-dependencies", []): + n, s = split_pypi_dep(dep) + rows.append(row(repo, pkg_name, "dev-group:dev", n, s)) + + return rows + + +def parse_package_json(path: Path, repo: str) -> list[dict]: + with open(path) as f: + data = json.load(f) + pkg_name = data.get("name", repo) + rows = [] + for section in ("dependencies", "devDependencies", "peerDependencies", 
"optionalDependencies"): + for dep, spec in data.get(section, {}).items(): + rows.append(row(repo, pkg_name, section, dep, str(spec))) + return rows + + +def parse_uv_lock(path: Path, repo: str) -> list[dict]: + with open(path, "rb") as f: + data = tomllib.load(f) + rows = [] + for pkg in data.get("package", []): + name = normalize_pypi(pkg.get("name", "")) + version = pkg.get("version", "") + if name and version: + rows.append(row(repo, repo, "transitive-pypi", name, version)) + return rows + + +def parse_package_lock(path: Path, repo: str) -> list[dict]: + with open(path) as f: + data = json.load(f) + rows = [] + for key, info in data.get("packages", {}).items(): + if key == "": + continue + dep_name = key.removeprefix("node_modules/") + rows.append(row(repo, repo, "transitive-npm", dep_name, info.get("version", ""))) + return rows + + +def resolve_repo(repo_arg: str) -> tuple[Path, str, tempfile.TemporaryDirectory | None]: + if repo_arg.startswith(("https://", "git@", "http://", "git://", "ssh://")): + if not shutil.which("git"): + sys.exit("Error: git not found in PATH") + tmpdir = tempfile.TemporaryDirectory() + dest = Path(tmpdir.name) / "repo" + print(f"Cloning {repo_arg} …", file=sys.stderr) + subprocess.run(["git", "clone", "--depth=1", repo_arg, str(dest)], + check=True, capture_output=True) + repo_name = Path(repo_arg.rstrip("/").split("/")[-1].removesuffix(".git")).name + return dest, repo_name, tmpdir + local = Path(repo_arg).resolve() + if not local.is_dir(): + sys.exit(f"Error: not a directory: {local}") + return local, local.name, None + + +_PARSERS = { + "pyproject.toml": parse_pyproject, + "package.json": parse_package_json, + "uv.lock": parse_uv_lock, + "package-lock.json": parse_package_lock, +} + + +def collect_rows(root: Path, repo_name: str) -> list[dict]: + rows: list[dict] = [] + for path in sorted(root.rglob("*")): + if path.name not in MANIFEST_NAMES: + continue + if any(p in EXCLUDE_DIRS for p in path.parts): + continue + try: + 
rows.extend(_PARSERS[path.name](path, repo_name)) + except Exception as e: + print(f"Warning: could not parse {path}: {e}", file=sys.stderr) + return rows + + +def main(): + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--repo", required=True, + help="Git URL to clone or local path to scan") + parser.add_argument("--out", type=Path, default=None, + help="Output CSV (default: -dependencies.csv)") + args = parser.parse_args() + + root, repo_name, tmpdir = resolve_repo(args.repo) + out = args.out or Path(f"{repo_name}-dependencies.csv") + + try: + rows = collect_rows(root, repo_name) + rows.sort(key=lambda r: (r["dependency_type"], r["dependency"], r["package_name"])) + + with open(out, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=FIELDNAMES) + writer.writeheader() + writer.writerows(rows) + + print(f"Wrote {len(rows)} rows to {out}") + by_type: dict[str, int] = {} + for r in rows: + by_type[r["dependency_type"]] = by_type.get(r["dependency_type"], 0) + 1 + for t, c in sorted(by_type.items()): + print(f" {t}: {c}") + finally: + if tmpdir: + tmpdir.cleanup() + + +if __name__ == "__main__": + main() diff --git a/scripts/min_version.py b/scripts/min_version.py new file mode 100644 index 00000000..60424eb8 --- /dev/null +++ b/scripts/min_version.py @@ -0,0 +1,52 @@ +""" +Best-effort extraction of the minimum semver implied by a version spec string. + +This is a separate module because the core dependency dumper doesn't need it — +the primary audit question is "do we depend on this package at all". Import this +only when you need the min_version column populated with parsed values. 
+""" + +import re + +_SEMVER_FINDALL = re.compile(r"(\d+\.\d+\.\d+(?:-[0-9A-Za-z.-]+)?)") +_SEMVER_EXACT = re.compile(r"^\d+\.\d+\.\d+(?:-[0-9A-Za-z.-]+)?$") +_NON_SEMVER_PREFIXES = ( + "workspace:", "file:", "link:", "portal:", "patch:", + "git:", "git+", "github:", "gitlab:", "bitbucket:", + "http://", "https://", +) + + +def _parse(s: str) -> tuple | None: + m = re.fullmatch(r"(\d+)\.(\d+)\.(\d+)(?:-([0-9A-Za-z.-]+))?", s) + if not m: + return None + return (int(m[1]), int(m[2]), int(m[3]), m[4] or "") + + +def _key(t: tuple) -> tuple: + major, minor, patch, pre = t + return (major, minor, patch, 0 if pre else 1, pre) # stable > prerelease + + +def min_version(spec: str) -> str: + """Return the lowest semver implied by spec, or '' if not determinable.""" + if not spec: + return "" + spec = spec.strip() + if any(spec.startswith(p) for p in _NON_SEMVER_PREFIXES): + return "" + if spec.startswith("npm:"): + inner = spec[4:] + at = inner.rfind("@") + if at > 0: + spec = inner[at + 1:].strip() + if spec in ("*", "x", "X", ""): + return "" + if _SEMVER_EXACT.match(spec): + return spec + candidates = [c for c in (_parse(m) for m in _SEMVER_FINDALL.findall(spec)) if c is not None] + if not candidates: + return "" + best = min(candidates, key=_key) + return "{}.{}.{}{}".format(best[0], best[1], best[2], f"-{best[3]}" if best[3] else "") diff --git a/scripts/test_dump_dependencies.py b/scripts/test_dump_dependencies.py new file mode 100644 index 00000000..a25c4605 --- /dev/null +++ b/scripts/test_dump_dependencies.py @@ -0,0 +1,324 @@ +"""Tests for dump-dependencies.py and min_version.py.""" + +import importlib.util +import json +import sys +from pathlib import Path + +import pytest + +# ── Import the hyphen-named script as a module ───────────────────────────────── +_SCRIPTS = Path(__file__).parent +sys.path.insert(0, str(_SCRIPTS)) + +spec = importlib.util.spec_from_file_location("dump_deps", _SCRIPTS / "dump-dependencies.py") +dump_deps = 
"""Tests for dump-dependencies.py and min_version.py."""

import importlib.util
import json
import sys
from pathlib import Path

import pytest

# ── Import the hyphen-named script as a module ─────────────────────────────────
# The script name contains a hyphen, so it cannot be imported with a plain
# `import` statement; load it from its file path instead.
_SCRIPTS = Path(__file__).parent
sys.path.insert(0, str(_SCRIPTS))

_module_spec = importlib.util.spec_from_file_location(
    "dump_deps", _SCRIPTS / "dump-dependencies.py")
if _module_spec is None or _module_spec.loader is None:
    raise ImportError("cannot load dump-dependencies.py")
dump_deps = importlib.util.module_from_spec(_module_spec)
_module_spec.loader.exec_module(dump_deps)

from min_version import min_version  # noqa: E402


def _write_text(directory, filename, content):
    """Write *content* as UTF-8 to directory/filename and return the path."""
    target = directory / filename
    target.write_text(content, encoding="utf-8")
    return target


# ── min_version ────────────────────────────────────────────────────────────────

class TestMinVersion:
    @pytest.mark.parametrize(
        ("spec", "expected"),
        [
            pytest.param("1.2.3", "1.2.3", id="exact_version"),
            pytest.param("^3.0.0", "3.0.0", id="caret_range"),
            pytest.param("~1.2.3", "1.2.3", id="tilde_range"),
            pytest.param(">=2.1.0", "2.1.0", id="gte_constraint"),
            pytest.param(">=1.2.0,<2.0.0", "1.2.0", id="range_picks_lower_bound"),
            pytest.param("1.0.0-alpha.1", "1.0.0-alpha.1", id="prerelease"),
            # 1.0.0-alpha < 1.0.0 (stable)
            pytest.param(">=1.0.0-alpha,<2.0.0", "1.0.0-alpha",
                         id="prerelease_vs_stable_picks_prerelease"),
            pytest.param("npm:rolldown-vite@7.1.14", "7.1.14", id="npm_alias"),
            pytest.param("npm:@scope/pkg@1.2.3", "1.2.3", id="npm_scoped_alias"),
            pytest.param("*", "", id="wildcard_star"),
            pytest.param("x", "", id="wildcard_x"),
            pytest.param("", "", id="empty"),
            pytest.param("workspace:*", "", id="workspace_protocol"),
            pytest.param("file:../foo", "", id="file_protocol"),
            pytest.param("git+https://github.com/org/repo", "", id="git_url"),
            pytest.param("https://example.com/pkg.tar.gz", "", id="https_url"),
            pytest.param("latest", "", id="non_semver_string"),
        ],
    )
    def test_min_version(self, spec, expected):
        assert min_version(spec) == expected


# ── normalize_pypi ─────────────────────────────────────────────────────────────

class TestNormalizePypi:
    @pytest.mark.parametrize(
        ("raw", "expected"),
        [
            pytest.param("my_package", "my-package", id="underscores_to_hyphens"),
            pytest.param("my.package", "my-package", id="dots_to_hyphens"),
            pytest.param("My__Package.Name", "my-package-name", id="mixed_separators"),
            pytest.param("Django", "django", id="lowercase"),
            pytest.param("requests", "requests", id="already_normalized"),
        ],
    )
    def test_normalize(self, raw, expected):
        assert dump_deps.normalize_pypi(raw) == expected


# ── split_pypi_dep ─────────────────────────────────────────────────────────────

class TestSplitPypiDep:
    @pytest.mark.parametrize(
        ("requirement", "expected"),
        [
            pytest.param("requests", ("requests", ""), id="bare_package"),
            pytest.param("requests>=2.0.0", ("requests", ">=2.0.0"), id="with_version"),
            pytest.param("django>=3.0,<4.0", ("django", ">=3.0,<4.0"), id="with_range"),
        ],
    )
    def test_name_and_spec(self, requirement, expected):
        assert dump_deps.split_pypi_dep(requirement) == expected

    def test_strips_marker(self):
        name, spec = dump_deps.split_pypi_dep(
            "importlib-metadata>=1.0 ; python_version < '3.8'")
        assert name == "importlib-metadata"
        assert "python_version" not in spec

    def test_strips_extras(self):
        name, _ = dump_deps.split_pypi_dep("uvicorn[standard]>=0.12")
        assert name == "uvicorn"

    def test_normalizes_name(self):
        name, _ = dump_deps.split_pypi_dep("My_Package>=1.0")
        assert name == "my-package"


# ── parse_pyproject ────────────────────────────────────────────────────────────

class TestParsePyproject:
    def _parse(self, tmp_path, content):
        """Write *content* as pyproject.toml and return the parsed rows."""
        manifest = _write_text(tmp_path, "pyproject.toml", content)
        return dump_deps.parse_pyproject(manifest, "myrepo")

    def test_direct_dependencies(self, tmp_path):
        rows = self._parse(tmp_path, """
[project]
name = "my-pkg"
dependencies = ["requests>=2.0", "click"]
""")
        deps = {r["dependency"]: r for r in rows}
        assert "requests" in deps
        assert deps["requests"]["version_spec"] == ">=2.0"
        assert deps["requests"]["dependency_type"] == "dependencies"
        assert deps["requests"]["package_name"] == "my-pkg"
        assert "click" in deps

    def test_optional_dependencies(self, tmp_path):
        rows = self._parse(tmp_path, """
[project]
name = "my-pkg"
[project.optional-dependencies]
memory = ["chromadb>=0.4"]
""")
        assert any(r["dependency"] == "chromadb" and r["dependency_type"] == "optional:memory"
                   for r in rows)

    def test_build_dependencies(self, tmp_path):
        rows = self._parse(tmp_path, """
[project]
name = "my-pkg"
[build-system]
requires = ["hatchling"]
""")
        assert any(r["dependency"] == "hatchling" and r["dependency_type"] == "build"
                   for r in rows)

    def test_dev_groups(self, tmp_path):
        rows = self._parse(tmp_path, """
[project]
name = "my-pkg"
[dependency-groups]
tests = ["pytest>=7", "pytest-cov"]
""")
        assert any(r["dependency"] == "pytest" and r["dependency_type"] == "dev-group:tests"
                   for r in rows)

    def test_no_dependencies_yields_no_rows(self, tmp_path):
        # A manifest without a project name and without deps must not crash.
        assert self._parse(tmp_path, "[project]\ndependencies = []\n") == []

    def test_falls_back_to_repo_name(self, tmp_path):
        # Without project.name the repo name is used as package_name; a row
        # is needed to observe it, so declare one dependency.
        rows = self._parse(tmp_path, '[project]\ndependencies = ["click"]\n')
        assert [r["package_name"] for r in rows] == ["myrepo"]

    def test_min_version_populated(self, tmp_path):
        rows = self._parse(tmp_path, """
[project]
name = "my-pkg"
dependencies = ["requests>=2.28.0"]
""")
        assert rows[0]["min_version"] == "2.28.0"


# ── parse_package_json ─────────────────────────────────────────────────────────

class TestParsePackageJson:
    def _parse(self, tmp_path, data):
        """Write *data* as package.json and return the parsed rows."""
        manifest = _write_text(tmp_path, "package.json", json.dumps(data))
        return dump_deps.parse_package_json(manifest, "myrepo")

    def test_dependencies(self, tmp_path):
        rows = self._parse(tmp_path, {
            "name": "@myorg/pkg",
            "dependencies": {"react": "^18.0.0"},
        })
        assert any(r["dependency"] == "react" and r["dependency_type"] == "dependencies"
                   for r in rows)

    def test_dev_dependencies(self, tmp_path):
        rows = self._parse(tmp_path, {
            "name": "@myorg/pkg",
            "devDependencies": {"vitest": "^1.0.0"},
        })
        assert any(r["dependency"] == "vitest" and r["dependency_type"] == "devDependencies"
                   for r in rows)

    def test_peer_and_optional(self, tmp_path):
        rows = self._parse(tmp_path, {
            "name": "pkg",
            "peerDependencies": {"react": ">=17"},
            "optionalDependencies": {"fsevents": "~2.3.2"},
        })
        types = {r["dependency"]: r["dependency_type"] for r in rows}
        assert types["react"] == "peerDependencies"
        assert types["fsevents"] == "optionalDependencies"

    def test_package_name_from_json(self, tmp_path):
        rows = self._parse(tmp_path, {"name": "@scope/my-lib",
                                      "dependencies": {"lodash": "^4.0.0"}})
        assert rows[0]["package_name"] == "@scope/my-lib"

    def test_min_version_caret(self, tmp_path):
        rows = self._parse(tmp_path, {"name": "pkg",
                                      "dependencies": {"lodash": "^4.17.21"}})
        assert rows[0]["min_version"] == "4.17.21"


# ── parse_uv_lock ──────────────────────────────────────────────────────────────

class TestParseUvLock:
    def _parse(self, tmp_path, content):
        """Write *content* as uv.lock and return the parsed rows."""
        lockfile = _write_text(tmp_path, "uv.lock", content)
        return dump_deps.parse_uv_lock(lockfile, "myrepo")

    def test_extracts_packages(self, tmp_path):
        rows = self._parse(tmp_path, """
version = 1
[[package]]
name = "requests"
version = "2.31.0"
source = { registry = "https://pypi.org/simple" }

[[package]]
name = "urllib3"
version = "2.0.7"
source = { registry = "https://pypi.org/simple" }
""")
        deps = {r["dependency"]: r for r in rows}
        assert "requests" in deps
        assert deps["requests"]["version_spec"] == "2.31.0"
        assert deps["requests"]["dependency_type"] == "transitive-pypi"
        assert deps["requests"]["min_version"] == "2.31.0"
        assert "urllib3" in deps

    def test_normalizes_package_name(self, tmp_path):
        rows = self._parse(tmp_path, """
version = 1
[[package]]
name = "My_Package"
version = "1.0.0"
""")
        assert rows[0]["dependency"] == "my-package"

    def test_skips_entries_without_version(self, tmp_path):
        rows = self._parse(tmp_path, """
version = 1
[[package]]
name = "incomplete"
""")
        assert rows == []


# ── parse_package_lock ─────────────────────────────────────────────────────────

class TestParsePackageLock:
    def _parse(self, tmp_path, data):
        """Write *data* as package-lock.json and return the parsed rows."""
        lockfile = _write_text(tmp_path, "package-lock.json", json.dumps(data))
        return dump_deps.parse_package_lock(lockfile, "myrepo")

    def test_extracts_packages(self, tmp_path):
        rows = self._parse(tmp_path, {
            "lockfileVersion": 3,
            "packages": {
                "": {"name": "root", "version": "1.0.0"},
                "node_modules/lodash": {"version": "4.17.21"},
                "node_modules/react": {"version": "18.2.0"},
            },
        })
        deps = {r["dependency"]: r for r in rows}
        assert "lodash" in deps
        assert deps["lodash"]["version_spec"] == "4.17.21"
        assert deps["lodash"]["dependency_type"] == "transitive-npm"
        assert "react" in deps
        assert "" not in deps  # root entry skipped

    def test_nested_hoisted_packages(self, tmp_path):
        # Only the leading node_modules/ prefix is stripped, so a nested
        # install keeps its parent path in the reported name.
        rows = self._parse(tmp_path, {
            "packages": {
                "node_modules/foo/node_modules/bar": {"version": "2.0.0"},
            },
        })
        assert rows[0]["dependency"] == "foo/node_modules/bar"

    def test_min_version_from_exact(self, tmp_path):
        rows = self._parse(tmp_path, {
            "packages": {"node_modules/zod": {"version": "3.22.4"}},
        })
        assert rows[0]["min_version"] == "3.22.4"