44from __future__ import annotations
55
66import dataclasses
7- import difflib
87import json
98import logging
109import os
1413import ijson .backends .python as ijson
1514
1615from pants .backend .go .go_sources .load_go_binary import LoadedGoBinaryRequest , setup_go_binary
17- from pants .backend .go .target_types import GoModTarget
1816from pants .backend .go .util_rules import pkg_analyzer
1917from pants .backend .go .util_rules .build_opts import GoBuildOptions
2018from pants .backend .go .util_rules .cgo import CGoCompilerFlags
@@ -157,15 +155,33 @@ class ModuleDescriptors:
157155
158156
@dataclass(frozen=True)
class ModuleDownloadRequest:
    """Download and analyze a Go module, keyed by (name, version, minimum_go_version,
    build_opts, go_sum_entries).

    This enables cross-go.mod deduplication: if mod-a and mod-b both depend on
    grpc@v1.60.0 with the same go.sum entries, the download and analysis only
    happens once because the Pants engine memoizes by the full request key.

    ``go_sum_entries`` carries the go.sum lines for ``<name> <version>`` and
    ``<name> <version>/go.mod`` extracted from the consuming go.mod's real
    go.sum. It is an empty tuple when that go.sum has no lines for this
    module@version. These entries are content-addressable by design: two
    well-formed go.sums MUST agree on them for the same module@version.
    Including them in the dedup key has two effects:

    1. Happy path: all consumers of module@version share one download, and the
       synthetic go.sum written into the sandbox lets Go perform its normal
       checksum verification (no GONOSUMCHECK override).
    2. Tampered path: if one go.sum disagrees, the two consumers produce
       distinct requests -- each verified independently against its own
       entries -- and the tampered one fails with Go's usual SECURITY ERROR.
    """

    # Module path, interpolated into the synthetic go.mod `require` line and
    # into the `go mod download <name>@<version>` argument.
    name: str
    # Module version, used alongside `name` in the same two places.
    version: str
    # Go version for the synthetic go.mod's `go` directive; None means the
    # download rule substitutes its own default.
    minimum_go_version: str | None
    # Part of the memoization key; not read by the download step shown here.
    build_opts: GoBuildOptions
    # go.sum lines for this module@version (possibly empty); written verbatim,
    # newline-joined, as the sandbox go.sum when non-empty.
    go_sum_entries: tuple[str, ...]
169185
170186
171187@dataclass (frozen = True )
@@ -277,6 +293,45 @@ def strip_sandbox_prefix(path: str, marker: str) -> str:
277293 return path
278294
279295
296+ def _parse_go_sum (go_sum_content : bytes ) -> dict [tuple [str , str ], tuple [str , ...]]:
297+ """Parse a go.sum file into a dict keyed by (module name, version).
298+
299+ A well-formed go.sum has up to two lines per (module, version):
300+
301+ <name> <version> h1:<content hash>=
302+ <name> <version>/go.mod h1:<go.mod hash>=
303+
304+ Returns a dict mapping (name, version) to a tuple of the matching lines,
305+ enabling O(1) lookup per module instead of re-scanning the file.
306+ """
307+ entries : dict [tuple [str , str ], list [str ]] = {}
308+ for line in go_sum_content .decode ("utf-8" ).splitlines ():
309+ if not line :
310+ continue
311+ parts = line .split (" " , 2 )
312+ if len (parts ) < 3 :
313+ continue
314+ name = parts [0 ]
315+ version_field = parts [1 ]
316+ # Strip the "/go.mod" suffix to get the base version for grouping.
317+ version = version_field .removesuffix ("/go.mod" )
318+ key = (name , version )
319+ entries .setdefault (key , []).append (line )
320+ return {k : tuple (v ) for k , v in entries .items ()}
321+
322+
def _extract_go_sum_entries_for_module(
    go_sum_content: bytes, name: str, version: str
) -> tuple[str, ...]:
    """Look up the go.sum lines recorded for a single module@version.

    Convenience wrapper that parses the whole go.sum with _parse_go_sum on
    every call and picks out one entry. When resolving several modules from
    the same go.sum, call _parse_go_sum once and index the result instead.

    Returns an empty tuple when the go.sum has no lines for (name, version).
    """
    return _parse_go_sum(go_sum_content).get((name, version), ())
333+
334+
280335def _freeze_json_dict (d : dict [Any , Any ]) -> FrozenDict [str , Any ]:
281336 result = {}
282337 for k , v in d .items ():
@@ -296,48 +351,6 @@ def _freeze_json_dict(d: dict[Any, Any]) -> FrozenDict[str, Any]:
296351 return FrozenDict (result )
297352
298353
299- async def _check_go_sum_has_not_changed (
300- input_digest : Digest ,
301- output_digest : Digest ,
302- dir_path : str ,
303- import_path : str ,
304- go_mod_address : Address ,
305- ) -> None :
306- input_entries , output_entries = await concurrently (
307- get_digest_contents (input_digest ),
308- get_digest_contents (output_digest ),
309- )
310-
311- go_sum_path = os .path .join (dir_path , "go.sum" )
312-
313- input_go_sum_entry : bytes | None = None
314- for entry in input_entries :
315- if entry .path == go_sum_path :
316- input_go_sum_entry = entry .content
317-
318- output_go_sum_entry : bytes | None = None
319- for entry in output_entries :
320- if entry .path == go_sum_path :
321- output_go_sum_entry = entry .content
322-
323- if input_go_sum_entry is not None or output_go_sum_entry is not None :
324- if input_go_sum_entry != output_go_sum_entry :
325- go_sum_diff = list (
326- difflib .unified_diff (
327- (input_go_sum_entry or b"" ).decode ().splitlines (),
328- (output_go_sum_entry or b"" ).decode ().splitlines (),
329- )
330- )
331- go_sum_diff_rendered = "\n " .join (line .rstrip () for line in go_sum_diff )
332- raise ValueError (
333- f"For `{ GoModTarget .alias } ` target `{ go_mod_address } `, the go.sum file is incomplete "
334- f"because it was updated while processing third-party dependency `{ import_path } `. "
335- "Please re-generate the go.sum file by running `go mod download all` in the module directory. "
336- "(Pants does not currently have support for updating the go.sum checksum database itself.)\n \n "
337- f"Diff:\n { go_sum_diff_rendered } "
338- )
339-
340-
341354@rule
342355async def analyze_go_third_party_package (
343356 request : AnalyzeThirdPartyPackageRequest ,
@@ -472,21 +485,46 @@ async def analyze_go_third_party_package(
472485
473486
474487@rule
475- async def analyze_go_third_party_module (
476- request : AnalyzeThirdPartyModuleRequest ,
488+ async def download_and_analyze_module (
489+ request : ModuleDownloadRequest ,
477490 analyzer : PackageAnalyzerSetup ,
478491) -> AnalyzedThirdPartyModule :
479- # Download the module.
492+ """Download and analyze a single Go module via a synthetic go.mod + go.sum.
493+
494+ Keyed by (name, version, minimum_go_version, build_opts, go_sum_entries),
495+ which lets the Pants engine deduplicate identical module downloads across
496+ go.mods.
497+
498+ A synthetic go.mod + go.sum pair is written into the sandbox so that Go's
499+ normal checksum verification still runs -- the go.sum entries come straight
500+ from the consuming go.mod's real go.sum (see ModuleDownloadRequest for the
501+ full argument for why this is safe).
502+ """
    # Create a synthetic go.mod (and go.sum when entries are available) that
    # only requires this one module. When the consuming go.sum contains the
    # entries for this module@version, we emit them verbatim so `go mod
    # download` performs its usual local checksum verification. When they
    # are absent (a module discovered during MVS that the consumer's go.sum
    # hasn't recorded yet), we omit the go.sum file and let Go fall back to
    # GOSUMDB (sum.golang.org by default) for verification. NOTE: unlike the
    # previous per-go.mod rule, which raised an error when `go mod download`
    # modified go.sum, nothing in this rule reports an incomplete go.sum.
511+ go_version = request .minimum_go_version or "1.21"
512+ synthetic_go_mod = (
513+ f"module synthetic.invalid\n \n go { go_version } \n \n require { request .name } { request .version } \n "
514+ )
515+ synthetic_files = [FileContent ("go.mod" , synthetic_go_mod .encode ())]
516+ if request .go_sum_entries :
517+ synthetic_go_sum = "\n " .join (request .go_sum_entries ) + "\n "
518+ synthetic_files .append (FileContent ("go.sum" , synthetic_go_sum .encode ()))
519+ synthetic_digest = await create_digest (CreateDigest (synthetic_files ))
520+
480521 download_result = await fallible_to_exec_result_or_raise (
481522 ** implicitly (
482523 GoSdkProcess (
483524 ("mod" , "download" , "-json" , f"{ request .name } @{ request .version } " ),
484- input_digest = request .go_mod_digest , # for go.sum
485- working_dir = os .path .dirname (request .go_mod_path ),
486- # Allow downloads of the module sources.
525+ input_digest = synthetic_digest ,
487526 allow_downloads = True ,
488527 output_directories = ("gopath" ,),
489- output_files = (os .path .join (os .path .dirname (request .go_mod_path ), "go.sum" ),),
490528 description = f"Download Go module { request .name } @{ request .version } ." ,
491529 )
492530 )
@@ -497,20 +535,10 @@ async def analyze_go_third_party_module(
497535 f"Expected output from `go mod download` for { request .name } @{ request .version } ."
498536 )
499537
500- # Make sure go.sum has not changed.
501- await _check_go_sum_has_not_changed (
502- input_digest = request .go_mod_digest ,
503- output_digest = download_result .output_digest ,
504- dir_path = os .path .dirname (request .go_mod_path ),
505- import_path = request .import_path ,
506- go_mod_address = request .go_mod_address ,
507- )
508-
509538 module_metadata = json .loads (download_result .stdout )
510539 module_sources_relpath = strip_sandbox_prefix (module_metadata ["Dir" ], "gopath/" )
511540 go_mod_relpath = strip_sandbox_prefix (module_metadata ["GoMod" ], "gopath/" )
512541
513- # Subset the output directory to just the module sources and go.mod (which may be generated).
514542 module_sources_snapshot = await digest_to_snapshot (
515543 ** implicitly (
516544 DigestSubset (
@@ -525,7 +553,6 @@ async def analyze_go_third_party_module(
525553 )
526554 )
527555
528- # Determine directories with potential Go packages in them.
529556 candidate_package_dirs = []
530557 files_by_dir = group_by_dir (
531558 p for p in module_sources_snapshot .files if p .startswith (module_sources_relpath )
@@ -535,13 +562,10 @@ async def analyze_go_third_party_module(
535562 # See https://github.com/golang/go/blob/f005df8b582658d54e63d59953201299d6fee880/src/go/build/build.go#L580-L585
536563 if "testdata" in maybe_pkg_dir .split ("/" ):
537564 continue
538-
539- # Consider directories with at least one `.go` file as package candidates.
540565 if any (f for f in files if f .endswith (".go" )):
541566 candidate_package_dirs .append (maybe_pkg_dir )
542567 candidate_package_dirs .sort ()
543568
544- # Analyze all of the packages in this module.
545569 analyzer_relpath = "__analyzer"
546570 analysis_result = await fallible_to_exec_result_or_raise (
547571 ** implicitly (
@@ -595,17 +619,32 @@ async def download_and_analyze_third_party_packages(
595619 )
596620 )
597621
622+ # Read the real go.sum once so we can extract per-module entries for the
623+ # download sandbox. This keeps Go's checksum verification intact while
624+ # allowing the engine to memoize identical module@version downloads
625+ # across different go.mods.
626+ go_sum_path = os .path .join (os .path .dirname (request .go_mod_path ), "go.sum" )
627+ digest_contents = await get_digest_contents (request .go_mod_digest )
628+ go_sum_content = b""
629+ for entry in digest_contents :
630+ if entry .path == go_sum_path :
631+ go_sum_content = entry .content
632+ break
633+
634+ # Parse the go.sum once into a dict for O(1) lookup per module.
635+ go_sum_index = _parse_go_sum (go_sum_content )
636+
637+ # The engine memoizes by (name, version, minimum_go_version, build_opts,
638+ # go_sum_entries), so identical modules across go.mods are downloaded
639+ # once -- reducing downloads from O(N*M) to O(M).
598640 analyzed_modules = await concurrently (
599- analyze_go_third_party_module (
600- AnalyzeThirdPartyModuleRequest (
601- go_mod_address = request .go_mod_address ,
602- go_mod_digest = request .go_mod_digest ,
603- go_mod_path = request .go_mod_path ,
604- import_path = mod .name ,
641+ download_and_analyze_module (
642+ ModuleDownloadRequest (
605643 name = mod .name ,
606644 version = mod .version ,
607645 minimum_go_version = mod .minimum_go_version ,
608646 build_opts = request .build_opts ,
647+ go_sum_entries = go_sum_index .get ((mod .name , mod .version ), ()),
609648 ),
610649 ** implicitly (),
611650 )
0 commit comments