Skip to content

Commit 16bc075

Browse files
[evals] expand package metadata long-tail PPL slices (#5061)
Cover the five surfaces called out in the issue DoD (registry JSON, dependency-graph rows, advisory text, release metadata, lockfile-like records) across PyPI / npm / crates.io / RubyGems / NuGet / Maven / Go plus GHSA + OSV. deps.dev BigQuery ingest is deferred per reviewer approval; entries stay metadata-only so no bulk HF mirroring or cross-region transfers are triggered by this change. Keeps these slices in the long-tail registry only; does not wire them into default_raw_validation_sets. Co-authored-by: David Hall <dlwh@users.noreply.github.com>
1 parent 7dd69d7 commit 16bc075

2 files changed

Lines changed: 148 additions & 3 deletions

File tree

experiments/evals/long_tail_ppl.py

Lines changed: 122 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -379,15 +379,23 @@ def _slice(
379379
raw_relative_path="formal/hwmcc/aiger_btor.jsonl.gz",
380380
notes="Use textual renderings only; preserve solver and model-checking syntax.",
381381
),
382-
# Package metadata
382+
# Package metadata — surfaces from issue #5061 DoD:
383+
# registry JSON, dependency-graph rows, version constraints, advisory text,
384+
# release metadata, and lockfile-like records. All entries are deterministic
385+
# file paths under raw_root; ingestion is deferred (no BigQuery, no bulk HF
386+
# mirroring). The deps.dev BigQuery ingest is tracked separately; reviewers
387+
# approved deferring the BigQuery tooling.
383388
_slice(
384389
name="deps_dev",
385390
family=LongTailPplFamily.PACKAGE_METADATA,
386391
issue_number=PACKAGE_METADATA_ISSUE,
387392
source_url="https://docs.deps.dev/bigquery/v1/",
388393
surface_form="dependency_rows",
389394
raw_relative_path="packages/deps_dev/rows.jsonl.gz",
390-
notes="Preserve package names, semver constraints, hashes, and dependency edges.",
395+
notes=(
396+
"Preserve package names, semver constraints, hashes, and dependency edges. "
397+
"BigQuery ingest is deferred; sample into region-local GCS with documented queries."
398+
),
391399
),
392400
_slice(
393401
name="ecosystem_ms_libraries_io",
@@ -398,14 +406,113 @@ def _slice(
398406
raw_relative_path="packages/ecosystems_ms/metadata.jsonl.gz",
399407
notes="Keep repository/package metadata, licenses, and release records literal.",
400408
),
409+
_slice(
410+
name="libraries_io_dependencies",
411+
family=LongTailPplFamily.PACKAGE_METADATA,
412+
issue_number=PACKAGE_METADATA_ISSUE,
413+
source_url="https://libraries.io/data",
414+
surface_form="dependency_edges",
415+
raw_relative_path="packages/libraries_io/dependencies.jsonl.gz",
416+
notes="Preserve package names, semver constraints, scopes, and platform markers per edge.",
417+
),
418+
_slice(
419+
name="pypi_registry_json",
420+
family=LongTailPplFamily.PACKAGE_METADATA,
421+
issue_number=PACKAGE_METADATA_ISSUE,
422+
source_url="https://warehouse.pypa.io/api-reference/json.html",
423+
surface_form="registry_json",
424+
raw_relative_path="packages/pypi/registry.jsonl.gz",
425+
notes="Preserve PyPI JSON API envelopes, classifiers, requires_dist strings, and file digests.",
426+
),
401427
_slice(
402428
name="npm_registry_metadata",
403429
family=LongTailPplFamily.PACKAGE_METADATA,
404430
issue_number=PACKAGE_METADATA_ISSUE,
405431
source_url="https://docs.npmjs.com/policies/crawlers/",
406432
surface_form="registry_json",
407433
raw_relative_path="packages/npm/registry.jsonl.gz",
408-
notes="Preserve CouchDB-style package JSON and nested version fields.",
434+
notes="Preserve CouchDB-style package JSON, scoped names, and nested version fields.",
435+
),
436+
_slice(
437+
name="crates_io_registry_json",
438+
family=LongTailPplFamily.PACKAGE_METADATA,
439+
issue_number=PACKAGE_METADATA_ISSUE,
440+
source_url="https://doc.rust-lang.org/cargo/reference/registry-web-api.html",
441+
surface_form="registry_json",
442+
raw_relative_path="packages/crates_io/registry.jsonl.gz",
443+
notes="Preserve crate versions, yanked flags, feature maps, and checksum fields.",
444+
),
445+
_slice(
446+
name="rubygems_registry_json",
447+
family=LongTailPplFamily.PACKAGE_METADATA,
448+
issue_number=PACKAGE_METADATA_ISSUE,
449+
source_url="https://guides.rubygems.org/rubygems-org-api/",
450+
surface_form="registry_json",
451+
raw_relative_path="packages/rubygems/registry.jsonl.gz",
452+
notes="Keep gem dependencies, runtime/development scopes, and SHA-256 digests literal.",
453+
),
454+
_slice(
455+
name="nuget_registry_index",
456+
family=LongTailPplFamily.PACKAGE_METADATA,
457+
issue_number=PACKAGE_METADATA_ISSUE,
458+
source_url="https://learn.microsoft.com/nuget/api/registration-base-url-resource",
459+
surface_form="registry_json",
460+
raw_relative_path="packages/nuget/registration_index.jsonl.gz",
461+
notes="Preserve NuGet registration envelopes, framework target strings, and catalog URLs.",
462+
),
463+
_slice(
464+
name="maven_central_metadata",
465+
family=LongTailPplFamily.PACKAGE_METADATA,
466+
issue_number=PACKAGE_METADATA_ISSUE,
467+
source_url="https://central.sonatype.org/search/rest-api-guide/",
468+
surface_form="maven_pom_xml",
469+
raw_relative_path="packages/maven/metadata.jsonl.gz",
470+
notes="Preserve group/artifact/version coordinates, scopes, exclusions, and XML layout.",
471+
),
472+
_slice(
473+
name="go_modules_proxy",
474+
family=LongTailPplFamily.PACKAGE_METADATA,
475+
issue_number=PACKAGE_METADATA_ISSUE,
476+
source_url="https://proxy.golang.org/",
477+
surface_form="go_mod_sum",
478+
raw_relative_path="packages/go_modules/proxy.jsonl.gz",
479+
notes="Keep module paths, pseudo-versions, and h1: sum strings exactly as served.",
480+
),
481+
_slice(
482+
name="ghsa_advisories",
483+
family=LongTailPplFamily.PACKAGE_METADATA,
484+
issue_number=PACKAGE_METADATA_ISSUE,
485+
source_url="https://github.com/github/advisory-database",
486+
surface_form="advisory_osv_json",
487+
raw_relative_path="packages/ghsa/advisories.jsonl.gz",
488+
notes="Preserve GHSA IDs, CVE aliases, affected-ranges, and OSV JSON punctuation.",
489+
),
490+
_slice(
491+
name="osv_advisories",
492+
family=LongTailPplFamily.PACKAGE_METADATA,
493+
issue_number=PACKAGE_METADATA_ISSUE,
494+
source_url="https://osv.dev/",
495+
surface_form="advisory_osv_json",
496+
raw_relative_path="packages/osv/advisories.jsonl.gz",
497+
notes="Keep OSV schema fields, semver ranges, and ecosystem tags literal.",
498+
),
499+
_slice(
500+
name="npm_release_metadata",
501+
family=LongTailPplFamily.PACKAGE_METADATA,
502+
issue_number=PACKAGE_METADATA_ISSUE,
503+
source_url="https://docs.npmjs.com/cli/v10/using-npm/registry",
504+
surface_form="release_metadata",
505+
raw_relative_path="packages/npm/release_metadata.jsonl.gz",
506+
notes="Keep dist-tags, tarball URLs, integrity hashes, and publish timestamps per release.",
507+
),
508+
_slice(
509+
name="pypi_release_metadata",
510+
family=LongTailPplFamily.PACKAGE_METADATA,
511+
issue_number=PACKAGE_METADATA_ISSUE,
512+
source_url="https://warehouse.pypa.io/api-reference/json.html",
513+
surface_form="release_metadata",
514+
raw_relative_path="packages/pypi/release_metadata.jsonl.gz",
515+
notes="Keep file digests, upload_time, requires_python, and yanked reason strings.",
409516
),
410517
_slice(
411518
name="package_lock_corpora",
@@ -416,6 +523,18 @@ def _slice(
416523
raw_relative_path="packages/package_lock/lockfiles.jsonl.gz",
417524
notes="Later pipeline work should keep lockfile structure, URLs, and checksums intact.",
418525
),
526+
_slice(
527+
name="the_stack_v2_lockfiles",
528+
family=LongTailPplFamily.PACKAGE_METADATA,
529+
issue_number=PACKAGE_METADATA_ISSUE,
530+
source_url="https://huggingface.co/datasets/bigcode/the-stack-v2",
531+
surface_form="lockfile",
532+
raw_relative_path="packages/the_stack_v2/lockfiles.jsonl.gz",
533+
notes=(
534+
"Filename-filtered stopgap (package-lock.json, yarn.lock, poetry.lock, Pipfile.lock, "
535+
"Cargo.lock, Gemfile.lock, go.sum) until the corpus from #4961 lands."
536+
),
537+
),
419538
# Game / music
420539
_slice(
421540
name="lichess_pgn",

tests/evals/test_long_tail_ppl.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55

66
from experiments.evals.long_tail_ppl import (
77
GAME_MUSIC_ISSUE,
8+
PACKAGE_METADATA_ISSUE,
89
LongTailPplFamily,
10+
long_tail_ppl_slices,
911
long_tail_raw_validation_sets,
1012
render_long_tail_ppl_registry_markdown,
1113
)
@@ -47,6 +49,30 @@ def test_hf_backed_raw_dataset_preserves_requested_split():
4749
assert component.source.splits == ["test"]
4850

4951

52+
def test_package_metadata_family_covers_dod_surfaces():
    """Check that the PACKAGE_METADATA family registers every DoD surface form.

    Issue #5061's definition of done calls for registry JSON, dependency-graph
    rows, advisory text, release metadata, and lockfile-like records. Dependency
    data is checked under two surface-form keys (``dependency_rows`` and
    ``dependency_edges``), so six keys are required in total. Each slice returned
    for the family must also carry the package-metadata issue number and a raw
    path under ``packages/``.
    """
    family_slices = long_tail_ppl_slices(family=LongTailPplFamily.PACKAGE_METADATA)

    # Collect the distinct surface forms registered under the family.
    registered = set()
    for entry in family_slices:
        registered.add(entry.surface_form)

    expected = {
        "registry_json",
        "dependency_rows",
        "dependency_edges",
        "advisory_osv_json",
        "release_metadata",
        "lockfile",
    }
    missing = expected - registered
    assert not missing, f"PACKAGE_METADATA family is missing DoD surfaces: {missing}"

    # Per-slice invariants: shared tracking issue and the packages/ path prefix.
    for entry in family_slices:
        assert entry.issue_number == PACKAGE_METADATA_ISSUE
        assert entry.raw_relative_path.startswith("packages/")
74+
75+
5076
def test_file_backed_raw_dataset_rejects_non_validation_split():
5177
with pytest.raises(ValueError, match="Hugging Face dataset sources"):
5278
raw_text_dataset("gs://example-bucket/eval.jsonl", split="test")

0 commit comments

Comments
 (0)