testbed: target 1T tokens for the by-provenance sample

ravwojdyla · claude · ravwojdyla · commit 0c3890768f92 · 2026-04-24T16:32:20.000-07:00
Raises ``RAW_TARGET_TOTAL_TOKENS_B`` (and the baseline / variants
per-run ``TARGET_TOTAL_TOKENS_B`` constants that feed
``build_testbed_steps``) from 10 B to 1000 B — the RFC-canonical 1 T
testbed. Removes the stale ``# TODO(rav): update this to 1T`` comment
from the settings default. ``scripts/datakit/run_source_sampling.py``
stays at 10 B because it is explicitly a smoke-test tool, not a
production entrypoint.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/experiments/datakit_testbed/baseline.py b/experiments/datakit_testbed/baseline.py
@@ -38,7 +38,7 @@
 logger = logging.getLogger(__name__)
 
 STAGING_PREFIX = "gs://marin-us-central1"
-TARGET_TOTAL_TOKENS_B = 10.0
+TARGET_TOTAL_TOKENS_B = 1000.0
 
 _SAMPLE_STEP_PREFIX = "data/datakit/"
 
diff --git a/experiments/datakit_testbed/settings.py b/experiments/datakit_testbed/settings.py
@@ -23,8 +23,7 @@
 source in the registry must either be pre-staged there or downloadable into it.
 """
 
-# TODO(rav): update this to 1T
-RAW_TARGET_TOTAL_TOKENS_B: float = 10.0
+RAW_TARGET_TOTAL_TOKENS_B: float = 1000.0
 """Target size (billions of tokens) for the pre-normalize by-provenance sample.
 
 Drives per-source sampling fractions via
diff --git a/experiments/datakit_testbed/variants.py b/experiments/datakit_testbed/variants.py
@@ -39,7 +39,7 @@
 logger = logging.getLogger(__name__)
 
 STAGING_PREFIX = "gs://marin-us-central1"
-TARGET_TOTAL_TOKENS_B = 10.0
+TARGET_TOTAL_TOKENS_B = 1000.0
 
 _SAMPLE_STEP_PREFIX = "data/datakit/"
 _FUZZY_DUPS_MAX_PARALLELISM = 128