@@ -379,15 +379,23 @@ def _slice(
379379 raw_relative_path = "formal/hwmcc/aiger_btor.jsonl.gz" ,
380380 notes = "Use textual renderings only; preserve solver and model-checking syntax." ,
381381 ),
382- # Package metadata
382+ # Package metadata — surfaces from the issue #5061 definition of done (DoD):
383+ # registry JSON, dependency-graph rows, version constraints, advisory text,
384+ # release metadata, and lockfile-like records. Every entry is a deterministic
385+ # file path under raw_root; ingestion itself is deferred (no BigQuery, no bulk
386+ # Hugging Face mirroring). The deps.dev BigQuery ingest is tracked separately;
387+ # the reviewer approved deferring the BigQuery tooling.
383388 _slice (
384389 name = "deps_dev" ,
385390 family = LongTailPplFamily .PACKAGE_METADATA ,
386391 issue_number = PACKAGE_METADATA_ISSUE ,
387392 source_url = "https://docs.deps.dev/bigquery/v1/" ,
388393 surface_form = "dependency_rows" ,
389394 raw_relative_path = "packages/deps_dev/rows.jsonl.gz" ,
390- notes = "Preserve package names, semver constraints, hashes, and dependency edges." ,
395+ notes = (
396+ "Preserve package names, semver constraints, hashes, and dependency edges. "
397+ "BigQuery ingest is deferred; sample into region-local GCS with documented queries."
398+ ),
391399 ),
392400 _slice (
393401 name = "ecosystem_ms_libraries_io" ,
@@ -398,14 +406,113 @@ def _slice(
398406 raw_relative_path = "packages/ecosystems_ms/metadata.jsonl.gz" ,
399407 notes = "Keep repository/package metadata, licenses, and release records literal." ,
400408 ),
409+ _slice (
410+ name = "libraries_io_dependencies" ,
411+ family = LongTailPplFamily .PACKAGE_METADATA ,
412+ issue_number = PACKAGE_METADATA_ISSUE ,
413+ source_url = "https://libraries.io/data" ,
414+ surface_form = "dependency_edges" ,
415+ raw_relative_path = "packages/libraries_io/dependencies.jsonl.gz" ,
416+ notes = "Preserve package names, semver constraints, scopes, and platform markers per edge." ,
417+ ),
418+ _slice (
419+ name = "pypi_registry_json" ,
420+ family = LongTailPplFamily .PACKAGE_METADATA ,
421+ issue_number = PACKAGE_METADATA_ISSUE ,
422+ source_url = "https://warehouse.pypa.io/api-reference/json.html" ,
423+ surface_form = "registry_json" ,
424+ raw_relative_path = "packages/pypi/registry.jsonl.gz" ,
425+ notes = "Preserve PyPI JSON API envelopes, classifiers, requires_dist strings, and file digests." ,
426+ ),
401427 _slice (
402428 name = "npm_registry_metadata" ,
403429 family = LongTailPplFamily .PACKAGE_METADATA ,
404430 issue_number = PACKAGE_METADATA_ISSUE ,
405431 source_url = "https://docs.npmjs.com/policies/crawlers/" ,
406432 surface_form = "registry_json" ,
407433 raw_relative_path = "packages/npm/registry.jsonl.gz" ,
408- notes = "Preserve CouchDB-style package JSON and nested version fields." ,
434+ notes = "Preserve CouchDB-style package JSON, scoped names, and nested version fields." ,
435+ ),
436+ _slice (
437+ name = "crates_io_registry_json" ,
438+ family = LongTailPplFamily .PACKAGE_METADATA ,
439+ issue_number = PACKAGE_METADATA_ISSUE ,
440+ source_url = "https://doc.rust-lang.org/cargo/reference/registry-web-api.html" ,
441+ surface_form = "registry_json" ,
442+ raw_relative_path = "packages/crates_io/registry.jsonl.gz" ,
443+ notes = "Preserve crate versions, yanked flags, feature maps, and checksum fields." ,
444+ ),
445+ _slice (
446+ name = "rubygems_registry_json" ,
447+ family = LongTailPplFamily .PACKAGE_METADATA ,
448+ issue_number = PACKAGE_METADATA_ISSUE ,
449+ source_url = "https://guides.rubygems.org/rubygems-org-api/" ,
450+ surface_form = "registry_json" ,
451+ raw_relative_path = "packages/rubygems/registry.jsonl.gz" ,
452+ notes = "Keep gem dependencies, runtime/development scopes, and SHA-256 digests literal." ,
453+ ),
454+ _slice (
455+ name = "nuget_registry_index" ,
456+ family = LongTailPplFamily .PACKAGE_METADATA ,
457+ issue_number = PACKAGE_METADATA_ISSUE ,
458+ source_url = "https://learn.microsoft.com/nuget/api/registration-base-url-resource" ,
459+ surface_form = "registry_json" ,
460+ raw_relative_path = "packages/nuget/registration_index.jsonl.gz" ,
461+ notes = "Preserve NuGet registration envelopes, framework target strings, and catalog URLs." ,
462+ ),
463+ _slice (
464+ name = "maven_central_metadata" ,
465+ family = LongTailPplFamily .PACKAGE_METADATA ,
466+ issue_number = PACKAGE_METADATA_ISSUE ,
467+ source_url = "https://central.sonatype.org/search/rest-api-guide/" ,
468+ surface_form = "maven_pom_xml" ,
469+ raw_relative_path = "packages/maven/metadata.jsonl.gz" ,
470+ notes = "Preserve group/artifact/version coordinates, scopes, exclusions, and XML layout." ,
471+ ),
472+ _slice (
473+ name = "go_modules_proxy" ,
474+ family = LongTailPplFamily .PACKAGE_METADATA ,
475+ issue_number = PACKAGE_METADATA_ISSUE ,
476+ source_url = "https://proxy.golang.org/" ,
477+ surface_form = "go_mod_sum" ,
478+ raw_relative_path = "packages/go_modules/proxy.jsonl.gz" ,
479+ notes = "Keep module paths, pseudo-versions, and h1: sum strings exactly as served." ,
480+ ),
481+ _slice (
482+ name = "ghsa_advisories" ,
483+ family = LongTailPplFamily .PACKAGE_METADATA ,
484+ issue_number = PACKAGE_METADATA_ISSUE ,
485+ source_url = "https://github.com/github/advisory-database" ,
486+ surface_form = "advisory_osv_json" ,
487+ raw_relative_path = "packages/ghsa/advisories.jsonl.gz" ,
488+ notes = "Preserve GHSA IDs, CVE aliases, affected-ranges, and OSV JSON punctuation." ,
489+ ),
490+ _slice (
491+ name = "osv_advisories" ,
492+ family = LongTailPplFamily .PACKAGE_METADATA ,
493+ issue_number = PACKAGE_METADATA_ISSUE ,
494+ source_url = "https://osv.dev/" ,
495+ surface_form = "advisory_osv_json" ,
496+ raw_relative_path = "packages/osv/advisories.jsonl.gz" ,
497+ notes = "Keep OSV schema fields, semver ranges, and ecosystem tags literal." ,
498+ ),
499+ _slice (
500+ name = "npm_release_metadata" ,
501+ family = LongTailPplFamily .PACKAGE_METADATA ,
502+ issue_number = PACKAGE_METADATA_ISSUE ,
503+ source_url = "https://docs.npmjs.com/cli/v10/using-npm/registry" ,
504+ surface_form = "release_metadata" ,
505+ raw_relative_path = "packages/npm/release_metadata.jsonl.gz" ,
506+ notes = "Keep dist-tags, tarball URLs, integrity hashes, and publish timestamps per release." ,
507+ ),
508+ _slice (
509+ name = "pypi_release_metadata" ,
510+ family = LongTailPplFamily .PACKAGE_METADATA ,
511+ issue_number = PACKAGE_METADATA_ISSUE ,
512+ source_url = "https://warehouse.pypa.io/api-reference/json.html" ,
513+ surface_form = "release_metadata" ,
514+ raw_relative_path = "packages/pypi/release_metadata.jsonl.gz" ,
515+ notes = "Keep file digests, upload_time, requires_python, and yanked reason strings." ,
409516 ),
410517 _slice (
411518 name = "package_lock_corpora" ,
@@ -416,6 +523,18 @@ def _slice(
416523 raw_relative_path = "packages/package_lock/lockfiles.jsonl.gz" ,
417524 notes = "Later pipeline work should keep lockfile structure, URLs, and checksums intact." ,
418525 ),
526+ _slice (
527+ name = "the_stack_v2_lockfiles" ,
528+ family = LongTailPplFamily .PACKAGE_METADATA ,
529+ issue_number = PACKAGE_METADATA_ISSUE ,
530+ source_url = "https://huggingface.co/datasets/bigcode/the-stack-v2" ,
531+ surface_form = "lockfile" ,
532+ raw_relative_path = "packages/the_stack_v2/lockfiles.jsonl.gz" ,
533+ notes = (
534+ "Filename-filtered stopgap (package-lock.json, yarn.lock, poetry.lock, Pipfile.lock, "
535+ "Cargo.lock, Gemfile.lock, go.sum) until the corpus from #4961 lands."
536+ ),
537+ ),
419538 # Game / music
420539 _slice (
421540 name = "lichess_pgn" ,
0 commit comments