test(integration): use local char-tokenizer fixture, drop HF Hub dependency

nv-alicheng · claude · nv-alicheng · commit 924be7a06a0d · 2026-05-12T14:26:25.000-07:00
Two sets of integration tests in PR #306's metrics-aggregator path needed attention in CI; only the first was flaky / slow because of HuggingFace Hub: - `TestTemplateIntegration::test_template_runs` (6 cases) called `AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")` on the aggregator subprocess's startup path. Cold-cache CI runs paid the ~1 MB download + tokenizer-init cost, sometimes pushing subprocess startup past the parent launcher's 30 s timeout. It also required network egress / HF_TOKEN in some CI environments. - The new tests in `test_signal_handling.py` were not affected by the Hub dependency (they don't pass `--tokenizer`), but the parent-owns-output-dir contract from the earlier #9 follow-up also applies to them — those tests now create the output dir themselves before spawning the subprocess. Fix: drop in a local character-level tokenizer fixture at `tests/assets/tokenizers/char/` (~3 KB total: `tokenizer.json` + `tokenizer_config.json`). It is loaded via the existing `AutoTokenizer.from_pretrained(local_dir)` codepath — no test-only hooks in production code. Each character is one token, which is enough for the aggregator's ISL/OSL/TPOT triggers to produce deterministic counts (the e2e test path doesn't care about tokenization correctness, only that *some* count appears). Effects: no network call on the aggregator startup path for these tests, no HF_TOKEN requirement, and tokenizer load completes in single-digit ms instead of seconds. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/tests/assets/tokenizers/char/tokenizer.json b/tests/assets/tokenizers/char/tokenizer.json
@@ -0,0 +1,187 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "<s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "</s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Split",
+    "pattern": {
+      "String": ""
+    },
+    "behavior": "Isolated",
+    "invert": false
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {}
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "<unk>": 0,
+      "<pad>": 1,
+      "<s>": 2,
+      "</s>": 3,
+      "a": 4,
+      "b": 5,
+      "c": 6,
+      "d": 7,
+      "e": 8,
+      "f": 9,
+      "g": 10,
+      "h": 11,
+      "i": 12,
+      "j": 13,
+      "k": 14,
+      "l": 15,
+      "m": 16,
+      "n": 17,
+      "o": 18,
+      "p": 19,
+      "q": 20,
+      "r": 21,
+      "s": 22,
+      "t": 23,
+      "u": 24,
+      "v": 25,
+      "w": 26,
+      "x": 27,
+      "y": 28,
+      "z": 29,
+      "A": 30,
+      "B": 31,
+      "C": 32,
+      "D": 33,
+      "E": 34,
+      "F": 35,
+      "G": 36,
+      "H": 37,
+      "I": 38,
+      "J": 39,
+      "K": 40,
+      "L": 41,
+      "M": 42,
+      "N": 43,
+      "O": 44,
+      "P": 45,
+      "Q": 46,
+      "R": 47,
+      "S": 48,
+      "T": 49,
+      "U": 50,
+      "V": 51,
+      "W": 52,
+      "X": 53,
+      "Y": 54,
+      "Z": 55,
+      "0": 56,
+      "1": 57,
+      "2": 58,
+      "3": 59,
+      "4": 60,
+      "5": 61,
+      "6": 62,
+      "7": 63,
+      "8": 64,
+      "9": 65,
+      " ": 66,
+      "\t": 67,
+      "\n": 68,
+      "\r": 69,
+      "!": 70,
+      "\"": 71,
+      "#": 72,
+      "$": 73,
+      "%": 74,
+      "&": 75,
+      "'": 76,
+      "(": 77,
+      ")": 78,
+      "*": 79,
+      "+": 80,
+      ",": 81,
+      "-": 82,
+      ".": 83,
+      "/": 84,
+      ":": 85,
+      ";": 86,
+      "<": 87,
+      "=": 88,
+      ">": 89,
+      "?": 90,
+      "@": 91,
+      "[": 92,
+      "\\": 93,
+      "]": 94,
+      "^": 95,
+      "_": 96,
+      "`": 97,
+      "{": 98,
+      "|": 99,
+      "}": 100,
+      "~": 101
+    },
+    "unk_token": "<unk>"
+  }
+}
diff --git a/tests/assets/tokenizers/char/tokenizer_config.json b/tests/assets/tokenizers/char/tokenizer_config.json
@@ -0,0 +1,9 @@
+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}
diff --git a/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py b/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py
@@ -95,6 +95,11 @@ def test_sigterm_writes_interrupted_final_snapshot(self, tmp_path: Path):
         socket_dir = tmp_path / "sockets"
         socket_dir.mkdir()
         output_dir = tmp_path / "output"
+        # The parent owns directory setup — the aggregator subprocess
+        # fails fast (SystemExit) on a missing output dir to surface
+        # contract violations in its own stderr instead of crashing
+        # later on the atomic-write path. Honor that contract here.
+        output_dir.mkdir()
         # Use a unique socket name per test to avoid collisions if a
         # previous test run left an IPC file behind.
         suffix = uuid.uuid4().hex[:8]
@@ -149,6 +154,7 @@ def test_sigint_does_not_finalize_aggregator(self, tmp_path: Path):
         socket_dir = tmp_path / "sockets"
         socket_dir.mkdir()
         output_dir = tmp_path / "output"
+        output_dir.mkdir()  # parent owns dir setup (see sibling test)
         suffix = uuid.uuid4().hex[:8]
         proc = _spawn_aggregator(
             socket_dir,
diff --git a/tests/integration/commands/test_benchmark_command.py b/tests/integration/commands/test_benchmark_command.py
@@ -183,13 +183,18 @@ def test_mode_logging(self, mock_http_echo_server, ds_dataset_path, caplog):
 )
 
 
-# Non-gated tokenizer model used in place of the templates' default
-# (which references gated meta-llama/Llama-3.1-*). The echo-server e2e
-# path doesn't care about the model identity, only that the tokenizer
-# exists for the metrics aggregator's ISL/OSL/TPOT triggers. TinyLlama's
-# tokenizer is ~1MB and matches the Llama-family tokenizer the templates
-# were written against.
-_TEST_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# Local character-level tokenizer fixture used in place of the templates'
+# default (which references gated `meta-llama/Llama-3.1-*`). The echo-server
+# e2e path doesn't care about the model identity, only that a tokenizer
+# loads for the metrics aggregator's ISL/OSL/TPOT triggers. Using a local
+# fixture removes the HuggingFace Hub dependency from CI: no network call,
+# no ~1 MB download, no HF_TOKEN requirement, and the load completes in
+# milliseconds rather than seconds — well inside the parent launcher's
+# readiness timeout. ``AutoTokenizer.from_pretrained`` supports local
+# directories as a first-class input, so this uses the same production
+# code path with no test-only hooks.
+_TEST_TOKENIZER_DIR = Path(__file__).resolve().parents[2] / "assets/tokenizers/char"
+_TEST_MODEL_NAME = str(_TEST_TOKENIZER_DIR)
 
 
 def _resolve_template(template_path: Path, server_url: str) -> dict: