Skip to content

Commit cad8d86

Browse files
rdeknijf-ewx and claude committed
backend/go: deduplicate third-party module downloads across go.mods
Replace the per-go.mod `AnalyzeThirdPartyModuleRequest` with a content-addressed `ModuleDownloadRequest` keyed on (name, version, minimum_go_version, build_opts, go_sum_entries). The Pants engine memoizes identical requests, so a module shared by N go.mods is downloaded and analyzed once instead of N times. A synthetic go.mod + go.sum pair is written into the download sandbox. The go.sum entries come from the consuming go.mod's real go.sum, keeping Go's checksum verification intact. When entries are absent (transitive modules not yet in go.sum), Go falls back to GOSUMDB. On a 3-go.mod reproducer, `pants list ::` peak memory drops from 91 GB to 32 GB (-65%) and wall time from 48s to 33s (-32%). A real 24-go.mod monorepo completes in 2.58 GB / 112s (previously OOM-killed). Partially addresses #20274. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cae762f commit cad8d86

3 files changed

Lines changed: 250 additions & 109 deletions

File tree

docs/notes/2.32.x.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@ When generating lockfiles, the new `python.resolves_to_uploaded_prior_to` option
148148

149149
#### Go
150150

151+
Third-party module analysis is now deduplicated across `go.mod` files. Previously, a module required by `N` `go.mod` files was downloaded and analyzed `N` times, which caused significant memory and time overhead in monorepos with many overlapping `go.mod` files. On a 3-`go.mod` reproducer, `pants list ::` peak memory dropped from 91 GB to 32 GB (-65%). This is a no-op for repos with a single `go.mod`. See [#20274](https://github.com/pantsbuild/pants/issues/20274).
152+
151153
### Plugin API changes
152154

153155
PyO3, the interface crate between Rust and Python, has been upgraded to v0.28.3.

src/python/pants/backend/go/util_rules/third_party_pkg.py

Lines changed: 115 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from __future__ import annotations
55

66
import dataclasses
7-
import difflib
87
import json
98
import logging
109
import os
@@ -14,7 +13,6 @@
1413
import ijson.backends.python as ijson
1514

1615
from pants.backend.go.go_sources.load_go_binary import LoadedGoBinaryRequest, setup_go_binary
17-
from pants.backend.go.target_types import GoModTarget
1816
from pants.backend.go.util_rules import pkg_analyzer
1917
from pants.backend.go.util_rules.build_opts import GoBuildOptions
2018
from pants.backend.go.util_rules.cgo import CGoCompilerFlags
@@ -157,15 +155,33 @@ class ModuleDescriptors:
157155

158156

159157
@dataclass(frozen=True)
160-
class AnalyzeThirdPartyModuleRequest:
161-
go_mod_address: Address
162-
go_mod_digest: Digest
163-
go_mod_path: str
164-
import_path: str
158+
class ModuleDownloadRequest:
    """Request to download and analyze one Go module at one version.

    The request is identified by every field below: (name, version,
    minimum_go_version, build_opts, go_sum_entries). Because the Pants engine
    memoizes on the full request value, two go.mods that both need e.g.
    grpc@v1.60.0 with identical go.sum entries produce equal requests, and the
    download + analysis runs a single time for both of them.

    ``go_sum_entries`` holds the go.sum lines for ``<name> <version>`` and
    ``<name> <version>/go.mod``, taken from the real go.sum that sits next to
    the consuming go.mod. For a given module@version, every well-formed go.sum
    must list the same hashes, so putting the lines into the request key is
    expected to behave as follows:

    * Matching go.sums: all consumers share one request. The lines are written
      into a synthetic go.sum in the download sandbox, so Go runs its regular
      checksum verification (nothing is disabled via ``GOSUMDB=off`` or
      ``GONOSUMDB``).
    * Diverging go.sums (e.g. one was tampered with): the consumers produce
      different requests, each is verified against its own lines, and the one
      with wrong hashes fails with Go's usual SECURITY ERROR.
    """

    # Module path, e.g. `google.golang.org/grpc`.
    name: str
    # Module version, e.g. `v1.60.0`.
    version: str
    # Minimum Go version associated with the module, if known.
    minimum_go_version: str | None
    # Build options; part of the key because analysis depends on them.
    build_opts: GoBuildOptions
    # go.sum lines for this module@version; empty when the consumer has none.
    go_sum_entries: tuple[str, ...]
169185

170186

171187
@dataclass(frozen=True)
@@ -277,6 +293,45 @@ def strip_sandbox_prefix(path: str, marker: str) -> str:
277293
return path
278294

279295

296+
def _parse_go_sum(go_sum_content: bytes) -> dict[tuple[str, str], tuple[str, ...]]:
297+
"""Parse a go.sum file into a dict keyed by (module name, version).
298+
299+
A well-formed go.sum has up to two lines per (module, version):
300+
301+
<name> <version> h1:<content hash>=
302+
<name> <version>/go.mod h1:<go.mod hash>=
303+
304+
Returns a dict mapping (name, version) to a tuple of the matching lines,
305+
enabling O(1) lookup per module instead of re-scanning the file.
306+
"""
307+
entries: dict[tuple[str, str], list[str]] = {}
308+
for line in go_sum_content.decode("utf-8").splitlines():
309+
if not line:
310+
continue
311+
parts = line.split(" ", 2)
312+
if len(parts) < 3:
313+
continue
314+
name = parts[0]
315+
version_field = parts[1]
316+
# Strip the "/go.mod" suffix to get the base version for grouping.
317+
version = version_field.removesuffix("/go.mod")
318+
key = (name, version)
319+
entries.setdefault(key, []).append(line)
320+
return {k: tuple(v) for k, v in entries.items()}
321+
322+
323+
def _extract_go_sum_entries_for_module(
    go_sum_content: bytes, name: str, version: str
) -> tuple[str, ...]:
    """Look up the go.sum lines recorded for a single module@version.

    Convenience helper for callers that need only one module: it parses the
    whole go.sum with `_parse_go_sum` on every call. When several modules are
    looked up in the same go.sum, call `_parse_go_sum` once and index the
    result instead.

    :param go_sum_content: Raw bytes of the go.sum file.
    :param name: Module path to look up.
    :param version: Module version to look up (without a `/go.mod` suffix).
    :return: The matching go.sum lines, or an empty tuple if there are none.
    """
    wanted = (name, version)
    lines_by_module = _parse_go_sum(go_sum_content)
    if wanted in lines_by_module:
        return lines_by_module[wanted]
    return ()
333+
334+
280335
def _freeze_json_dict(d: dict[Any, Any]) -> FrozenDict[str, Any]:
281336
result = {}
282337
for k, v in d.items():
@@ -296,48 +351,6 @@ def _freeze_json_dict(d: dict[Any, Any]) -> FrozenDict[str, Any]:
296351
return FrozenDict(result)
297352

298353

299-
async def _check_go_sum_has_not_changed(
300-
input_digest: Digest,
301-
output_digest: Digest,
302-
dir_path: str,
303-
import_path: str,
304-
go_mod_address: Address,
305-
) -> None:
306-
input_entries, output_entries = await concurrently(
307-
get_digest_contents(input_digest),
308-
get_digest_contents(output_digest),
309-
)
310-
311-
go_sum_path = os.path.join(dir_path, "go.sum")
312-
313-
input_go_sum_entry: bytes | None = None
314-
for entry in input_entries:
315-
if entry.path == go_sum_path:
316-
input_go_sum_entry = entry.content
317-
318-
output_go_sum_entry: bytes | None = None
319-
for entry in output_entries:
320-
if entry.path == go_sum_path:
321-
output_go_sum_entry = entry.content
322-
323-
if input_go_sum_entry is not None or output_go_sum_entry is not None:
324-
if input_go_sum_entry != output_go_sum_entry:
325-
go_sum_diff = list(
326-
difflib.unified_diff(
327-
(input_go_sum_entry or b"").decode().splitlines(),
328-
(output_go_sum_entry or b"").decode().splitlines(),
329-
)
330-
)
331-
go_sum_diff_rendered = "\n".join(line.rstrip() for line in go_sum_diff)
332-
raise ValueError(
333-
f"For `{GoModTarget.alias}` target `{go_mod_address}`, the go.sum file is incomplete "
334-
f"because it was updated while processing third-party dependency `{import_path}`. "
335-
"Please re-generate the go.sum file by running `go mod download all` in the module directory. "
336-
"(Pants does not currently have support for updating the go.sum checksum database itself.)\n\n"
337-
f"Diff:\n{go_sum_diff_rendered}"
338-
)
339-
340-
341354
@rule
342355
async def analyze_go_third_party_package(
343356
request: AnalyzeThirdPartyPackageRequest,
@@ -472,21 +485,46 @@ async def analyze_go_third_party_package(
472485

473486

474487
@rule
475-
async def analyze_go_third_party_module(
476-
request: AnalyzeThirdPartyModuleRequest,
488+
async def download_and_analyze_module(
489+
request: ModuleDownloadRequest,
477490
analyzer: PackageAnalyzerSetup,
478491
) -> AnalyzedThirdPartyModule:
479-
# Download the module.
492+
"""Download and analyze a single Go module via a synthetic go.mod + go.sum.
493+
494+
Keyed by (name, version, minimum_go_version, build_opts, go_sum_entries),
495+
which lets the Pants engine deduplicate identical module downloads across
496+
go.mods.
497+
498+
A synthetic go.mod + go.sum pair is written into the sandbox so that Go's
499+
normal checksum verification still runs -- the go.sum entries come straight
500+
from the consuming go.mod's real go.sum (see ModuleDownloadRequest for the
501+
full argument for why this is safe).
502+
"""
503+
# Create a synthetic go.mod (and go.sum when entries are available) that
504+
# only requires this one module. When the consuming go.sum contains the
505+
# entries for this module@version, we emit them verbatim so `go mod
506+
# download` performs its usual local checksum verification. When they
507+
# are absent (a module discovered during MVS that the consumer's go.sum
508+
# hasn't recorded yet), we omit the go.sum file and let Go fall back to
509+
# GOSUMDB (sum.golang.org by default) for verification -- the same
510+
# behavior the old per-go.mod rule effectively had.
511+
go_version = request.minimum_go_version or "1.21"
512+
synthetic_go_mod = (
513+
f"module synthetic.invalid\n\ngo {go_version}\n\nrequire {request.name} {request.version}\n"
514+
)
515+
synthetic_files = [FileContent("go.mod", synthetic_go_mod.encode())]
516+
if request.go_sum_entries:
517+
synthetic_go_sum = "\n".join(request.go_sum_entries) + "\n"
518+
synthetic_files.append(FileContent("go.sum", synthetic_go_sum.encode()))
519+
synthetic_digest = await create_digest(CreateDigest(synthetic_files))
520+
480521
download_result = await fallible_to_exec_result_or_raise(
481522
**implicitly(
482523
GoSdkProcess(
483524
("mod", "download", "-json", f"{request.name}@{request.version}"),
484-
input_digest=request.go_mod_digest, # for go.sum
485-
working_dir=os.path.dirname(request.go_mod_path),
486-
# Allow downloads of the module sources.
525+
input_digest=synthetic_digest,
487526
allow_downloads=True,
488527
output_directories=("gopath",),
489-
output_files=(os.path.join(os.path.dirname(request.go_mod_path), "go.sum"),),
490528
description=f"Download Go module {request.name}@{request.version}.",
491529
)
492530
)
@@ -497,20 +535,10 @@ async def analyze_go_third_party_module(
497535
f"Expected output from `go mod download` for {request.name}@{request.version}."
498536
)
499537

500-
# Make sure go.sum has not changed.
501-
await _check_go_sum_has_not_changed(
502-
input_digest=request.go_mod_digest,
503-
output_digest=download_result.output_digest,
504-
dir_path=os.path.dirname(request.go_mod_path),
505-
import_path=request.import_path,
506-
go_mod_address=request.go_mod_address,
507-
)
508-
509538
module_metadata = json.loads(download_result.stdout)
510539
module_sources_relpath = strip_sandbox_prefix(module_metadata["Dir"], "gopath/")
511540
go_mod_relpath = strip_sandbox_prefix(module_metadata["GoMod"], "gopath/")
512541

513-
# Subset the output directory to just the module sources and go.mod (which may be generated).
514542
module_sources_snapshot = await digest_to_snapshot(
515543
**implicitly(
516544
DigestSubset(
@@ -525,7 +553,6 @@ async def analyze_go_third_party_module(
525553
)
526554
)
527555

528-
# Determine directories with potential Go packages in them.
529556
candidate_package_dirs = []
530557
files_by_dir = group_by_dir(
531558
p for p in module_sources_snapshot.files if p.startswith(module_sources_relpath)
@@ -535,13 +562,10 @@ async def analyze_go_third_party_module(
535562
# See https://github.com/golang/go/blob/f005df8b582658d54e63d59953201299d6fee880/src/go/build/build.go#L580-L585
536563
if "testdata" in maybe_pkg_dir.split("/"):
537564
continue
538-
539-
# Consider directories with at least one `.go` file as package candidates.
540565
if any(f for f in files if f.endswith(".go")):
541566
candidate_package_dirs.append(maybe_pkg_dir)
542567
candidate_package_dirs.sort()
543568

544-
# Analyze all of the packages in this module.
545569
analyzer_relpath = "__analyzer"
546570
analysis_result = await fallible_to_exec_result_or_raise(
547571
**implicitly(
@@ -595,17 +619,32 @@ async def download_and_analyze_third_party_packages(
595619
)
596620
)
597621

622+
# Read the real go.sum once so we can extract per-module entries for the
623+
# download sandbox. This keeps Go's checksum verification intact while
624+
# allowing the engine to memoize identical module@version downloads
625+
# across different go.mods.
626+
go_sum_path = os.path.join(os.path.dirname(request.go_mod_path), "go.sum")
627+
digest_contents = await get_digest_contents(request.go_mod_digest)
628+
go_sum_content = b""
629+
for entry in digest_contents:
630+
if entry.path == go_sum_path:
631+
go_sum_content = entry.content
632+
break
633+
634+
# Parse the go.sum once into a dict for O(1) lookup per module.
635+
go_sum_index = _parse_go_sum(go_sum_content)
636+
637+
# The engine memoizes by (name, version, minimum_go_version, build_opts,
638+
# go_sum_entries), so identical modules across go.mods are downloaded
639+
# once -- reducing downloads from O(N*M) to O(M).
598640
analyzed_modules = await concurrently(
599-
analyze_go_third_party_module(
600-
AnalyzeThirdPartyModuleRequest(
601-
go_mod_address=request.go_mod_address,
602-
go_mod_digest=request.go_mod_digest,
603-
go_mod_path=request.go_mod_path,
604-
import_path=mod.name,
641+
download_and_analyze_module(
642+
ModuleDownloadRequest(
605643
name=mod.name,
606644
version=mod.version,
607645
minimum_go_version=mod.minimum_go_version,
608646
build_opts=request.build_opts,
647+
go_sum_entries=go_sum_index.get((mod.name, mod.version), ()),
609648
),
610649
**implicitly(),
611650
)

0 commit comments

Comments
 (0)